From 4e75934e151617d7aba31058ad721c21a5f29e6e Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 24 Oct 2024 15:39:38 +0200 Subject: [PATCH 01/32] Poc: cost attribution proposal 1.2 --- cmd/mimir/config-descriptor.json | 55 ++++++ cmd/mimir/help-all.txt.tmpl | 10 ++ pkg/costattribution/manager.go | 159 ++++++++++++++++ pkg/costattribution/tracker.go | 200 +++++++++++++++++++++ pkg/distributor/distributor.go | 32 +++- pkg/distributor/validate.go | 59 +++++- pkg/ingester/activeseries/active_series.go | 97 +++++++--- pkg/ingester/ingester.go | 105 +++++++++-- pkg/ingester/user_tsdb.go | 4 +- pkg/mimir/mimir.go | 30 ++-- pkg/mimir/modules.go | 28 ++- pkg/streamingpromql/benchmarks/ingester.go | 2 +- pkg/util/validation/limits.go | 16 +- 13 files changed, 730 insertions(+), 67 deletions(-) create mode 100644 pkg/costattribution/manager.go create mode 100644 pkg/costattribution/tracker.go diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index d40e91c862f..e05a43e7932 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4347,6 +4347,28 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "cost_attribution_labels", + "required": false, + "desc": "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "validation.cost-attribution-labels", + "fieldType": "string", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_per_user", + "required": false, + "desc": "Maximum number of cost attribution labels allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "validation.max-cost-attribution-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "ruler_evaluation_delay_duration", @@ -18346,6 +18368,17 @@ "fieldValue": null, "fieldDefaultValue": null }, + { + "kind": "field", + "name": "custom_registry_path", + "required": false, + "desc": "Defines a custom path for the registry. 
When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "custom-registry-path", + "fieldType": "string", + "fieldCategory": "advanced" + }, { "kind": "field", "name": "timeseries_unmarshal_caching_optimization_enabled", @@ -18356,6 +18389,28 @@ "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled", "fieldType": "boolean", "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_eviction_interval", + "required": false, + "desc": "Time interval at which inactive cost attributions will be evicted from the cache.", + "fieldValue": null, + "fieldDefaultValue": 1800000000000, + "fieldFlag": "cost-attribution-eviction-interval", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_cool_down_duration", + "required": false, + "desc": "Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache.", + "fieldValue": null, + "fieldDefaultValue": 1200000000000, + "fieldFlag": "cost-attribution-cool-down-duration", + "fieldType": "duration", + "fieldCategory": "experimental" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 64323dbf111..9977eed5e61 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1139,6 +1139,12 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. + -cost-attribution-cool-down-duration duration + [experimental] Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache. (default 20m0s) + -cost-attribution-eviction-interval duration + [experimental] Time interval at which inactive cost attributions will be evicted from the cache. (default 30m0s) + -custom-registry-path string + Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. -debug.mutex-profile-fraction int @@ -3097,10 +3103,14 @@ Usage of ./cmd/mimir/mimir: Enable anonymous usage reporting. (default true) -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") + -validation.cost-attribution-labels comma-separated-list-of-strings + [experimental] List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. 
This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) + -validation.max-cost-attribution-per-user int + [experimental] Maximum number of cost attribution labels allowed per user. -validation.max-label-names-per-series int Maximum number of label names per series. (default 30) -validation.max-length-label-name int diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go new file mode 100644 index 00000000000..cda08a6eaa3 --- /dev/null +++ b/pkg/costattribution/manager.go @@ -0,0 +1,159 @@ +package costattribution + +import ( + "context" + "sort" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/services" + "github.com/prometheus/client_golang/prometheus" + + "github.com/grafana/mimir/pkg/util/validation" +) + +const ( + missingValue = "__missing__" + overflowValue = "__overflow__" +) + +type Manager struct { + services.Service + logger log.Logger + inactiveTimeout time.Duration + limits *validation.Overrides + cooldownTimeout time.Duration + + // mu protects the trackersByUserID map + tlock sync.RWMutex + trackersByUserID map[string]*Tracker +} + +// NewManager creates a new cost attribution manager. which is responsible for managing the cost attribution of series. +// It will clean up inactive series and update the cost attribution of series every 3 minutes. +func NewManager(cleanupInterval, inactiveTimeout time.Duration, cooldownTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager { + s := &Manager{ + trackersByUserID: make(map[string]*Tracker), + limits: limits, + tlock: sync.RWMutex{}, + cooldownTimeout: cooldownTimeout, + inactiveTimeout: inactiveTimeout, + logger: logger, + } + + s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution manager") + return s +} + +func (m *Manager) iteration(_ context.Context) error { + m.purgeInactiveAttributions(m.inactiveTimeout) + return nil +} + +// EnabledForUser returns true if the cost attribution is enabled for the user +func (m *Manager) EnabledForUser(userID string) bool { + return len(m.limits.CostAttributionLabel(userID)) > 0 +} + +func (m *Manager) TrackerForUser(userID string) *Tracker { + // if cost attribution is not enabled, return nil + if !m.EnabledForUser(userID) { + return nil + } + m.tlock.Lock() + defer m.tlock.Unlock() + + // if not exists, create a new tracker + if _, exists := m.trackersByUserID[userID]; !exists { + m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabel(userID), m.limits.MaxCostAttributionPerUser(userID)) + } + return m.trackersByUserID[userID] +} + +func (m *Manager) Collect(out chan<- prometheus.Metric) { + m.tlock.RLock() + defer m.tlock.RUnlock() + for _, tracker := range m.trackersByUserID { + tracker.Collect(out) + } +} + +// Describe implements prometheus.Collector. 
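+// It deliberately sends no descriptors: the Manager is registered as an
+// unchecked collector, since the label sets exposed by the per-tenant trackers
+// depend on each tenant's configured cost attribution labels and may differ.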
+func (m *Manager) Describe(chan<- *prometheus.Desc) { + // this is an unchecked collector +} + +// deleteUserTracer is delete user tracker since the user is disabled for cost attribution +func (m *Manager) deleteUserTracer(userID string) { + m.tlock.Lock() + defer m.tlock.Unlock() + if _, exists := m.trackersByUserID[userID]; !exists { + return + } + // clean up tracker metrics and delete the tracker + m.trackersByUserID[userID].cleanupTracker(userID) + delete(m.trackersByUserID, userID) +} + +func (m *Manager) purgeInactiveAttributions(inactiveTimeout time.Duration) { + + // Get all userIDs from the map + m.tlock.RLock() + userIDs := make([]string, 0, len(m.trackersByUserID)) + for userID := range m.trackersByUserID { + userIDs = append(userIDs, userID) + } + m.tlock.RUnlock() + + // Iterate over all userIDs and purge inactive attributions of each user + currentTime := time.Now() + for _, userID := range userIDs { + // if cost attribution is not enabled for the user, delete the user tracker and continue + if len(m.limits.CostAttributionLabel(userID)) == 0 || m.limits.MaxCostAttributionPerUser(userID) <= 0 { + m.deleteUserTracer(userID) + continue + } + // get all inactive attributions for the user and clean up the tracker + inactiveObs := m.purgeInactiveObservationsForUser(userID, currentTime.Add(-inactiveTimeout).UnixNano()) + + for _, ob := range inactiveObs { + m.trackersByUserID[userID].cleanupTrackerAttribution(ob.lvalues) + } + } +} + +// compare two sorted string slices +func compareStringSlice(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i, v := range a { + if v != b[i] { + return false + } + } + return true +} + +func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64) []*observation { + cat := m.TrackerForUser(userID) + if cat == nil { + return nil + } + + newTrackedLabels := sort.StringSlice(m.limits.CostAttributionLabel(userID)) + // if they are different, we need to update the tracker, we don't mind, just reinitalized the tracker + if !compareStringSlice(cat.trackedLabels, newTrackedLabels) { + m.tlock.Lock() + m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabel(userID), m.limits.MaxCostAttributionPerUser(userID)) + // update the tracker with the new tracker + cat = m.trackersByUserID[userID] + m.tlock.Unlock() + } else if maxCardinality := m.limits.MaxCostAttributionPerUser(userID); cat.maxCardinality != maxCardinality { + // if the maxCardinality is different, update the tracker + cat.updateMaxCardinality(maxCardinality) + } + + return cat.PurgeInactiveObservations(deadline) +} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go new file mode 100644 index 00000000000..d9f61cbda93 --- /dev/null +++ b/pkg/costattribution/tracker.go @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "sort" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + "go.uber.org/atomic" +) + +type observation struct { + lvalues []string + lastUpdate *atomic.Int64 +} + +func (t *Tracker) cleanupTrackerAttribution(vals []string) { + t.activeSeriesPerUserAttribution.DeleteLabelValues(vals...) + t.receivedSamplesAttribution.DeleteLabelValues(vals...) + t.discardedSampleAttribution.DeleteLabelValues(vals...) 
+} + +func (t *Tracker) cleanupTracker(userID string) { + filter := prometheus.Labels{"user": userID} + t.activeSeriesPerUserAttribution.DeletePartialMatch(filter) + t.receivedSamplesAttribution.DeletePartialMatch(filter) + t.discardedSampleAttribution.DeletePartialMatch(filter) +} + +type Tracker struct { + userID string + trackedLabels []string + maxCardinality int + activeSeriesPerUserAttribution *prometheus.GaugeVec + receivedSamplesAttribution *prometheus.CounterVec + discardedSampleAttribution *prometheus.CounterVec + + // oLock protects the observed map + oLock sync.RWMutex + observed map[uint64]*observation + + hashBuffer []byte +} + +func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { + vals := t.getKeyValues(lbs, now.Unix()) + t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() +} + +func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { + vals := t.getKeyValues(lbs, now.Unix()) + t.discardedSampleAttribution.WithLabelValues(vals...).Add(value) +} + +func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { + vals := t.getKeyValues(lbs, now.Unix()) + t.receivedSamplesAttribution.WithLabelValues(vals...).Add(value) +} + +func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64) []string { + values := make([]string, len(t.trackedLabels)+1) + for i, l := range t.trackedLabels { + values[i] = lbls.Get(l) + if values[i] == "" { + values[i] = missingValue + } + } + values[len(values)-1] = t.userID + + var stream uint64 + stream, t.hashBuffer = lbls.HashForLabels(t.hashBuffer, t.trackedLabels...) + if t.overflow(stream, values, ts) { + // Omit last label. + for i := range values[:len(values)-1] { + values[i] = overflowValue + } + } + + return values +} + +func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { + // If the maximum cardinality is hit all streams become `__overflow__`. 
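+	// For example, with -validation.cost-attribution-labels=team and
+	// -validation.max-cost-attribution-per-user=2, once more than two distinct
+	// team values have been observed for a tenant, subsequent samples for any
+	// team are attributed to team="__overflow__" until the cache is reset.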
+ if len(t.observed) > t.maxCardinality { + return true + } + + if o, known := t.observed[stream]; known && o.lastUpdate != nil && o.lastUpdate.Load() < ts { + o.lastUpdate.Store(ts) + } else { + t.observed[stream] = &observation{ + lvalues: values, + lastUpdate: atomic.NewInt64(ts), + } + } + + return false +} + +// we need the time stamp, since active series could have entered active stripe long time ago, and already evicted +// from the observed map but still in the active Stripe +func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, value int64, ts time.Time) { + vals := t.getKeyValues(lbs, ts.Unix()) + t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() +} + +func newTracker(trackedLabels []string, limit int) (*Tracker, error) { + // keep tracked labels sorted for consistent metric labels + sort.Strings(trackedLabels) + m := &Tracker{ + trackedLabels: trackedLabels, + maxCardinality: limit, + oLock: sync.RWMutex{}, + observed: map[uint64]*observation{}, + //nolint:faillint // the metrics are registered in the mimir package + discardedSampleAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_discarded_samples_attribution_total", + Help: "The total number of samples that were discarded per attribution.", + }, append(trackedLabels, "user")), + //nolint:faillint + receivedSamplesAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_received_samples_attribution_total", + Help: "The total number of samples that were received per attribution.", + }, append(trackedLabels, "user")), + //nolint:faillint + activeSeriesPerUserAttribution: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_ingester_active_series_attribution", + Help: "The total number of active series per user and attribution.", + }, append(trackedLabels, "user")), + } + return m, nil +} + +func (t *Tracker) Collect(out chan<- prometheus.Metric) { + t.activeSeriesPerUserAttribution.Collect(out) + t.receivedSamplesAttribution.Collect(out) + t.discardedSampleAttribution.Collect(out) +} + +// Describe implements prometheus.Collector. 
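+// Like the Manager, the Tracker sends no descriptors (unchecked collector):
+// its metric label set, the tracked cost attribution labels plus "user", is
+// tenant-specific.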
+func (t *Tracker) Describe(chan<- *prometheus.Desc) { + // this is an unchecked collector +} + +func (t *Tracker) PurgeInactiveObservations(deadline int64) []*observation { + obs := t.observed + if obs == nil { + return nil + } + + var invalidKeys []uint64 + for labHash, ob := range obs { + if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline { + invalidKeys = append(invalidKeys, labHash) + } + } + + if len(invalidKeys) == 0 { + return nil + } + + t.oLock.Lock() + defer t.oLock.Unlock() + + // Cleanup inactive observations and return all invalid observations to clean up metrics for them + res := make([]*observation, len(invalidKeys)) + for i := 0; i < len(invalidKeys); { + inactiveLab := invalidKeys[i] + ob := t.observed[inactiveLab] + if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline { + delete(t.observed, inactiveLab) + res[i] = ob + i++ + } else { + invalidKeys[i] = invalidKeys[len(invalidKeys)-1] + invalidKeys = invalidKeys[:len(invalidKeys)-1] + } + } + + return res[:len(invalidKeys)] +} + +func (t *Tracker) updateMaxCardinality(limit int) { + // if we are reducing limit, we can just set it + if t.maxCardinality >= limit { + t.maxCardinality = limit + return + } + // if we are increasing limit, we need to check if we are already in overflow, + // if yes, reset the counter, otherwise the counters won't be correct + t.oLock.Lock() + defer t.oLock.Unlock() + if len(t.observed) > t.maxCardinality { + t.observed = map[uint64]*observation{} + } + t.maxCardinality = limit +} diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index eac4e482f66..0a5ad882b56 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -48,6 +48,7 @@ import ( "golang.org/x/sync/errgroup" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" ingester_client "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" @@ -105,7 +106,7 @@ type Distributor struct { distributorsLifecycler *ring.BasicLifecycler distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 - + costAttributionMgr *costattribution.Manager // For handling HA replicas. 
HATracker *haTracker @@ -306,7 +307,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -336,6 +337,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove healthyInstancesCount: atomic.NewUint32(0), limits: limits, HATracker: haTracker, + costAttributionMgr: costAttributionMgr, ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -709,21 +711,21 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) error { - if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation); err != nil { + now := model.TimeFromUnixNano(nowt.UnixNano()) + cat := getCATrackerForUser(userID, d.costAttributionMgr) + if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return err } - now := model.TimeFromUnixNano(nowt.UnixNano()) - for _, s := range ts.Samples { - if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s); err != nil { + if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err } } histogramsUpdated := false for i := range ts.Histograms { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[i]) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[i], cat) if err != nil { return err } @@ -835,7 +837,8 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { } numSamples := 0 - group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), time.Now()) + now := time.Now() + group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), now) for _, ts := range req.Timeseries { numSamples += len(ts.Samples) + len(ts.Histograms) } @@ -849,6 +852,11 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if 
errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) + if d.costAttributionMgr != nil { + if cat := d.costAttributionMgr.TrackerForUser(userID); cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) + } + } } return err @@ -1104,6 +1112,9 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { + if cat := getCATrackerForUser(userID, d.costAttributionMgr); cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) d.discardedMetadataRateLimited.WithLabelValues(userID).Add(float64(validatedMetadata)) @@ -1662,10 +1673,15 @@ func tokenForMetadata(userID string, metricName string) uint32 { } func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { + now := mtime.Now() var receivedSamples, receivedExemplars, receivedMetadata int + for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) + if cat := getCATrackerForUser(userID, d.costAttributionMgr); cat != nil { + cat.IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), now) + } } receivedMetadata = len(req.Metadata) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 21ce5adc2c4..3296d8ec50a 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -16,6 +16,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/extract" "github.com/grafana/mimir/pkg/util/globalerror" @@ -221,15 +222,22 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
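+// The cost attribution tracker (cat) may be nil when cost attribution is
+// disabled for the tenant; in that case no per-attribution discard is recorded.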
-func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() + // if the validation failed, we need to increment the discarded samples metric + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) + } unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) + } unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -240,20 +248,29 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) + } m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) + } m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidNativeHistogramSchema, now.Time()) + } m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -267,6 +284,9 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { + if cat != nil { + 
cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) + } m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -274,6 +294,9 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) + } m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -372,22 +395,40 @@ func removeNonASCIIChars(in string) (out string) { return out } +// getCATrackerForUser returns the cost attribution tracker for the user. +// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns nil. +func getCATrackerForUser(userID string, cam *costattribution.Manager) *costattribution.Tracker { + if cam == nil { + return nil + } + return cam.TrackerForUser(userID) +} + // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. -func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMissingMetricName, ts) + } m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidMetricName, ts) + } m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, removeNonASCIIChars(unsafeMetricName)) } if !skipLabelCountValidation && len(ls) > cfg.MaxLabelNamesPerSeries(userID) { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerSeries, ts) + } metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -398,17 +439,29 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabel, ts) + } return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelNameTooLong, ts) + } m.labelNameTooLong.WithLabelValues(userID, group).Inc() 
return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Value) > maxLabelValueLength { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelValueTooLong, ts) + } m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonDuplicateLabelNames, ts) + } m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 71044b5e348..d827097839f 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -13,10 +13,12 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/util/zeropool" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -44,9 +46,10 @@ type ActiveSeries struct { stripes [numStripes]seriesStripe deleted deletedSeries - // matchersMutex protects matchers and lastMatchersUpdate. + // matchersMutex protects matchers and lastMatchersUpdate. it used by both matchers and cat matchersMutex sync.RWMutex matchers *asmodel.Matchers + cat *costattribution.Tracker lastMatchersUpdate time.Time // The duration after which series become inactive. @@ -63,8 +66,8 @@ type seriesStripe struct { // Unix nanoseconds. Only used by purge. Zero = unknown. // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). - oldestEntryTs atomic.Int64 - + oldestEntryTs atomic.Int64 + cat *costattribution.Tracker mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -73,6 +76,8 @@ type seriesStripe struct { activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. + userID string + buf labels.ScratchBuilder } // seriesEntry holds a timestamp for single series. @@ -80,16 +85,22 @@ type seriesEntry struct { nanos *atomic.Int64 // Unix timestamp in nanoseconds. Needs to be a pointer because we don't store pointers to entries in the stripe. matches asmodel.PreAllocDynamicSlice // Index of the matcher matching numNativeHistogramBuckets int // Number of buckets in native histogram series, -1 if not a native histogram. 
- + // keep the value corresponding the label configured in serieStripe deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration) *ActiveSeries { - c := &ActiveSeries{matchers: asm, timeout: timeout} +func NewActiveSeries( + asm *asmodel.Matchers, + timeout time.Duration, + cat *costattribution.Tracker, +) *ActiveSeries { + c := &ActiveSeries{ + matchers: asm, timeout: timeout, cat: cat, + } // Stripes are pre-allocated so that we only read on them and no lock is required. for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, cat) } return c @@ -106,7 +117,7 @@ func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { defer c.matchersMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, c.cat) } c.matchers = asm c.lastMatchersUpdate = now @@ -118,18 +129,24 @@ func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { return c.matchers.Config() } +func (c *ActiveSeries) CurrentCostAttributionTracker() *costattribution.Tracker { + c.matchersMutex.RLock() + defer c.matchersMutex.RUnlock() + return c.cat +} + // UpdateSeries updates series timestamp to 'now'. Function is called to make a copy of labels if entry doesn't exist yet. // Pass -1 in numNativeHistogramBuckets if the series is not a native histogram series. -func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int) { +func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int, idx tsdb.IndexReader) { stripeID := ref % numStripes - created := c.stripes[stripeID].updateSeriesTimestamp(now, series, ref, numNativeHistogramBuckets) if created { if deleted, ok := c.deleted.find(series); ok { deletedStripeID := deleted.ref % numStripes - c.stripes[deletedStripeID].remove(deleted.ref) + c.stripes[deletedStripeID].remove(deleted.ref, idx) } } + } // PostDeletion should be called when series are deleted from the head. @@ -149,20 +166,21 @@ func (c *ActiveSeries) PostDeletion(deleted map[chunks.HeadSeriesRef]labels.Labe // Purge purges expired entries and returns true if enough time has passed since // last reload. This should be called periodically to avoid unbounded memory // growth. -func (c *ActiveSeries) Purge(now time.Time) bool { +func (c *ActiveSeries) Purge(now time.Time, idx tsdb.IndexReader) bool { c.matchersMutex.Lock() defer c.matchersMutex.Unlock() purgeTime := now.Add(-c.timeout) - c.purge(purgeTime) + c.purge(purgeTime, idx) return !c.lastMatchersUpdate.After(purgeTime) } // purge removes expired entries from the cache. 
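+// The TSDB head index is used to resolve the labels of evicted series so that
+// the cost attribution tracker can decrement its active series counters; when
+// idx is nil those counters are left unchanged.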
-func (c *ActiveSeries) purge(keepUntil time.Time) { +func (c *ActiveSeries) purge(keepUntil time.Time, idx tsdb.IndexReader) { for s := 0; s < numStripes; s++ { - c.stripes[s].purge(keepUntil) + c.stripes[s].purge(keepUntil, idx) } + } func (c *ActiveSeries) ContainsRef(ref storage.SeriesRef) bool { @@ -212,9 +230,9 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot return } -func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef) { +func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef, idx tsdb.IndexReader) { stripeID := storage.SeriesRef(ref) % numStripes - c.stripes[stripeID].remove(storage.SeriesRef(ref)) + c.stripes[stripeID].remove(storage.SeriesRef(ref), idx) } func (c *ActiveSeries) Clear() { @@ -375,6 +393,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef matchesLen := matches.Len() s.active++ + if numNativeHistogramBuckets >= 0 { s.activeNativeHistograms++ s.activeNativeHistogramBuckets += uint32(numNativeHistogramBuckets) @@ -394,6 +413,12 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } + // here if we have a cost attribution label, we can split the serie count based on the value of the label + // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly + if s.cat != nil { + s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) + } + s.refs[ref] = e return e.nanos, true } @@ -403,6 +428,7 @@ func (s *seriesStripe) clear() { defer s.mu.Unlock() s.oldestEntryTs.Store(0) + // TODO: s.refs = map[storage.SeriesRef]seriesEntry{} s.active = 0 s.activeNativeHistograms = 0 @@ -415,10 +441,13 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. -func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries) { +func (s *seriesStripe) reinitialize( + asm *asmodel.Matchers, + deleted *deletedSeries, + cat *costattribution.Tracker, +) { s.mu.Lock() defer s.mu.Unlock() - s.deleted = deleted s.oldestEntryTs.Store(0) s.refs = map[storage.SeriesRef]seriesEntry{} @@ -429,9 +458,11 @@ func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSerie s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) + s.cat = cat + s.buf = labels.NewScratchBuilder(128) } -func (s *seriesStripe) purge(keepUntil time.Time) { +func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { keepUntilNanos := keepUntil.UnixNano() if oldest := s.oldestEntryTs.Load(); oldest > 0 && keepUntilNanos <= oldest { // Nothing to do. 
@@ -449,13 +480,29 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(s.activeMatchingNativeHistogramBuckets), s.activeMatchingNativeHistogramBuckets) oldest := int64(math.MaxInt64) + buf := labels.NewScratchBuilder(128) for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { if entry.deleted { s.deleted.purge(ref) } + + // idx, err := db.Head().Index() + // err = idx.Series(seriesRef, &buf, nil) + // if err != nil { + // return fmt.Errorf("error getting series: %w", err) + // } + // m := &mimirpb.Metric{Labels: mimirpb.FromLabelsToLabelAdapters(buf.Labels())} + + if s.cat != nil && idx != nil { + if err := idx.Series(ref, &buf, nil); err != nil { + //TODO: think about what to do here + } + s.cat.DecrementActiveSeries(buf.Labels(), 1, keepUntil) + } delete(s.refs, ref) + // TODO: here need to find what is deleted and decrement counters continue } @@ -464,6 +511,7 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.activeNativeHistograms++ s.activeNativeHistogramBuckets += uint32(entry.numNativeHistogramBuckets) } + ml := entry.matches.Len() for i := 0; i < ml; i++ { match := entry.matches.Get(i) @@ -489,7 +537,7 @@ func (s *seriesStripe) purge(keepUntil time.Time) { // This is mostly the same logic from purge() but we decrement counters for a single entry instead of incrementing for each entry. // Note: we might remove the oldest series here, but the worst thing can happen is that we let run a useless purge() cycle later, // so this method doesn't update the oldestEntryTs. -func (s *seriesStripe) remove(ref storage.SeriesRef) { +func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { s.mu.Lock() defer s.mu.Unlock() @@ -502,6 +550,13 @@ func (s *seriesStripe) remove(ref storage.SeriesRef) { } s.active-- + if s.cat != nil && idx != nil { + if err := idx.Series(ref, &s.buf, nil); err != nil { + //TODO: think about what to do here + _ = err + } + s.cat.DecrementActiveSeries(s.buf.Labels(), 1, time.Now()) + } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 13b66bd64ca..b0323441234 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -51,6 +51,7 @@ import ( "golang.org/x/exp/slices" "golang.org/x/sync/errgroup" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester/activeseries" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" @@ -312,6 +313,8 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService + costAttributionMgr *costattribution.Manager + tsdbMetrics *tsdbMetrics forceCompactTrigger chan requestWithUsersAndCallback @@ -366,21 +369,21 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus limits: limits, logger: logger, - tsdbs: make(map[string]*userTSDB), - usersMetadata: make(map[string]*userMetricsMetadata), + tsdbs: make(map[string]*userTSDB), + usersMetadata: make(map[string]*userMetricsMetadata), + bucket: bucketClient, tsdbMetrics: newTSDBMetrics(registerer, logger), shipperMetrics: newShipperMetrics(registerer), forceCompactTrigger: make(chan requestWithUsersAndCallback), shipTrigger: make(chan requestWithUsersAndCallback), seriesHashCache: hashcache.NewSeriesHashCache(cfg.BlocksStorageConfig.TSDB.SeriesHashCacheMaxBytes), - - errorSamplers: 
newIngesterErrSamplers(cfg.ErrorSampleRate), + errorSamplers: newIngesterErrSamplers(cfg.ErrorSampleRate), }, nil } // New returns an Ingester that uses Mimir block storage. -func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -388,7 +391,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.ingestionRate = util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval) i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService - + i.costAttributionMgr = costAttributionMgr // We create a circuit breaker, which will be activated on a successful completion of starting. i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -769,6 +772,15 @@ func (i *Ingester) replaceMatchers(asm *asmodel.Matchers, userDB *userTSDB, now userDB.activeSeries.ReloadMatchers(asm, now) } +// getCATrackerForUser returns the cost attribution tracker for the user. +// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns nil. +func getCATrackerForUser(userID string, cam *costattribution.Manager) *costattribution.Tracker { + if cam == nil { + return nil + } + return cam.TrackerForUser(userID) +} + func (i *Ingester) updateActiveSeries(now time.Time) { for _, userID := range i.getTSDBUsers() { userDB := i.getTSDB(userID) @@ -777,10 +789,13 @@ func (i *Ingester) updateActiveSeries(now time.Time) { } newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) - if newMatchersConfig.String() != userDB.activeSeries.CurrentConfig().String() { + newCostAttributionTracker := getCATrackerForUser(userID, i.costAttributionMgr) + if newMatchersConfig.String() != userDB.activeSeries.CurrentConfig().String() || newCostAttributionTracker != userDB.activeSeries.CurrentCostAttributionTracker() { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } - valid := userDB.activeSeries.Purge(now) + + idx, _ := userDB.Head().Index() + valid := userDB.activeSeries.Purge(now, idx) if !valid { // Active series config has been reloaded, exposing loading metric until MetricsIdleTimeout passes. 
i.metrics.activeSeriesLoading.WithLabelValues(userID).Set(1) @@ -1159,7 +1174,8 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // Keep track of some stats which are tracked only if the samples will be // successfully committed - stats pushStats + + stats = pushStats{} firstPartialErr error // updateFirstPartial is a function that, in case of a softError, stores that error @@ -1284,8 +1300,11 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.PreallocTimeseries, app extendedAppender, startAppend time.Time, stats *pushStats, updateFirstPartial func(sampler *util_log.Sampler, errFn softErrorFunction), activeSeries *activeseries.ActiveSeries, outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { - // Return true if handled as soft error, and we can ingest more series. + // get the cost attribution value for the series + + cat := getCATrackerForUser(userID, i.costAttributionMgr) + handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { stats.failedSamplesCount++ @@ -1295,6 +1314,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // we actually ingested all samples which haven't failed. switch { case errors.Is(err, storage.ErrOutOfBounds): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfBounds, startAppend) + } stats.sampleOutOfBoundsCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) @@ -1302,6 +1324,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrOutOfOrderSample): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) + } stats.sampleOutOfOrderCount++ updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) @@ -1309,6 +1334,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrTooOldSample): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) + } stats.sampleTooOldCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) @@ -1316,6 +1344,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.SampleTooFarInFuture): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) + } stats.sampleTooFarInFutureCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) @@ -1323,6 +1354,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrDuplicateSampleForTimestamp): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) + } stats.newValueForTimestampCount++ 
updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) @@ -1330,6 +1364,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.MaxSeriesPerUser): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) + } stats.perUserSeriesLimitCount++ updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) @@ -1337,6 +1374,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.MaxSeriesPerMetric): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) + } stats.perMetricSeriesLimitCount++ updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) @@ -1351,30 +1391,45 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre }) return true case errors.Is(err, histogram.ErrHistogramCountMismatch): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + } stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramCountNotBigEnough): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + } stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramNegativeBucketCount): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + } stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramSpanNegativeOffset): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + } stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramSpansBucketsMismatch): + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + } stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return 
newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) @@ -1397,6 +1452,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels for _, ts := range timeseries { + // The labels must be sorted (in our case, it's guaranteed a write request // has sorted labels once hit the ingester). @@ -1412,7 +1468,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleOutOfBoundsCount += len(ts.Samples) + len(ts.Histograms) - + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleOutOfBounds, startAppend) + } var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1433,7 +1491,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) stats.sampleOutOfBoundsCount += len(ts.Samples) - + if cat != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleOutOfBounds, startAppend) + } firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -1554,7 +1614,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } if activeSeries != nil && stats.succeededSamplesCount > oldSucceededSamplesCount { - activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets) + idx, _ := i.getTSDB(userID).Head().Index() + // TODO: deal with the error here + activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets, idx) } if len(ts.Exemplars) > 0 && i.limits.MaxGlobalExemplarsPerUser(userID) > 0 { @@ -2647,9 +2709,14 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD ownedSeriedStateShardSize = i.ownedSeriesService.ringStrategy.shardSizeForUser(userID) } + cat := getCATrackerForUser(userID, i.costAttributionMgr) userDB := &userTSDB{ - userID: userID, - activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout), + userID: userID, + activeSeries: activeseries.NewActiveSeries( + asmodel.NewMatchers(matchersConfig), + i.cfg.ActiveSeriesMetrics.IdleTimeout, + cat, + ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), @@ -2664,6 +2731,7 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD localSeriesLimit: initialLocalLimit, }, } + userDB.triggerRecomputeOwnedSeries(recomputeOwnedSeriesReasonNewUser) oooTW := i.limits.OutOfOrderTimeWindow(userID) @@ -3239,7 +3307,12 @@ func (i *Ingester) compactBlocksToReduceInMemorySeries(ctx context.Context, now } // Purge the active series so that the next call to Active() will return the up-to-date count. 
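+		// The head index lets Purge resolve series labels and decrement the
+		// per-tenant cost attribution counters; if it cannot be obtained, the
+		// user is skipped for this cycle.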
- db.activeSeries.Purge(now) + idx, err := db.Head().Index() + if err != nil { + level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) + continue + } + db.activeSeries.Purge(now, idx) // Estimate the number of series that would be dropped from the TSDB Head if we would // compact the head up until "now - active series idle timeout". diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index 95bfe9840e2..e9766753525 100644 --- a/pkg/ingester/user_tsdb.go +++ b/pkg/ingester/user_tsdb.go @@ -624,7 +624,9 @@ func (u *userTSDB) computeOwnedSeries() int { if u.ownedTokenRanges.IncludesKey(sh) { count++ } else { - u.activeSeries.Delete(refs[i]) + idx, _ := u.Head().Index() + // TODO: deal with the err here + u.activeSeries.Delete(refs[i], idx) } } }) diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 7bcd3eac250..f3ed3543fa5 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -52,6 +52,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -145,9 +146,12 @@ type Config struct { ContinuousTest continuoustest.Config `yaml:"-"` OverridesExporter exporter.Config `yaml:"overrides_exporter"` - Common CommonConfig `yaml:"common"` + Common CommonConfig `yaml:"common"` + CustomRegistryPath string `yaml:"custom_registry_path" category:"advanced"` - TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` + TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` + CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` + CostAttributionCoolDownDuration time.Duration `yaml:"cost_attribution_cool_down_duration" category:"experimental"` } // RegisterFlags registers flags. @@ -170,10 +174,12 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.StringVar(&c.NoAuthTenant, "auth.no-auth-tenant", "anonymous", "Tenant ID to use when multitenancy is disabled.") f.BoolVar(&c.PrintConfig, "print.config", false, "Print the config and exit.") f.DurationVar(&c.ShutdownDelay, "shutdown-delay", 0, "How long to wait between SIGTERM and shutdown. 
After receiving SIGTERM, Mimir will report not-ready status via /ready endpoint.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution-eviction-interval", 30*time.Minute, "Time interval at which inactive cost attributions will be evicted from the cache.") + f.DurationVar(&c.CostAttributionCoolDownDuration, "cost-attribution-cool-down-duration", 20*time.Minute, "Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache.") f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") - + f.StringVar(&c.CustomRegistryPath, "custom-registry-path", "", "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.") c.API.RegisterFlags(f) c.registerServerFlagsWithChangedDefaultValues(f) c.Distributor.RegisterFlags(f, logger) @@ -705,14 +711,16 @@ type Mimir struct { ServiceMap map[string]services.Service ModuleManager *modules.Manager - API *api.API - Server *server.Server - IngesterRing *ring.Ring - IngesterPartitionRingWatcher *ring.PartitionRingWatcher - IngesterPartitionInstanceRing *ring.PartitionInstanceRing - TenantLimits validation.TenantLimits - Overrides *validation.Overrides - ActiveGroupsCleanup *util.ActiveGroupsCleanupService + API *api.API + Server *server.Server + IngesterRing *ring.Ring + IngesterPartitionRingWatcher *ring.PartitionRingWatcher + IngesterPartitionInstanceRing *ring.PartitionInstanceRing + TenantLimits validation.TenantLimits + Overrides *validation.Overrides + ActiveGroupsCleanup *util.ActiveGroupsCleanupService + CostAttributionManager *costattribution.Manager + Distributor *distributor.Distributor Ingester *ingester.Ingester Flusher *flusher.Flusher diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index d95f21c1c5f..36b60160ad6 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -29,6 +29,7 @@ import ( "github.com/prometheus/alertmanager/featurecontrol" "github.com/prometheus/alertmanager/matchers/compat" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/common/config" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/rules" @@ -80,6 +81,7 @@ const ( OverridesExporter string = "overrides-exporter" Server string = "server" ActiveGroupsCleanupService string = "active-groups-cleanup-service" + CostAttributionService string = "cost-attribution-service" Distributor string = "distributor" DistributorService string = "distributor-service" Ingester string = "ingester" @@ -462,7 +464,9 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { t.Cfg.Distributor.PreferAvailabilityZone = t.Cfg.Querier.PreferAvailabilityZone t.Cfg.Distributor.IngestStorageConfig = t.Cfg.IngestStorage - t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, t.ActiveGroupsCleanup, t.IngesterRing, 
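For orientation, a minimal sketch of how the new cost attribution Config fields might be set programmatically, equivalent to the -custom-registry-path, -cost-attribution-eviction-interval and -cost-attribution-cool-down-duration flags registered above; the path value is illustrative only and the durations are simply the flag defaults.

// Hypothetical wiring; only the fields added by this patch are shown.
exampleCfg := Config{
	CustomRegistryPath:              "/cost-attribution-metrics",
	CostAttributionEvictionInterval: 30 * time.Minute,
	CostAttributionCoolDownDuration: 20 * time.Minute,
}
_ = exampleCfg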
t.IngesterPartitionInstanceRing, canJoinDistributorsRing, t.Registerer, util_log.Logger) + t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, + t.ActiveGroupsCleanup, t.CostAttributionManager, t.IngesterRing, t.IngesterPartitionInstanceRing, + canJoinDistributorsRing, t.Registerer, util_log.Logger) if err != nil { return } @@ -644,6 +648,21 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { return t.ActiveGroupsCleanup, nil } +func (t *Mimir) initCostAttributionService() (services.Service, error) { + // The cost attribution service is only initialized if the custom registry path is provided. + if t.Cfg.CustomRegistryPath != "" { + // If a custom registry path is provided, create a custom registry and use it for the cost attribution service. + customRegistry := prometheus.NewRegistry() + // Register the custom registry's handler on the provided path. + // This allows users to expose custom metrics on a separate endpoint. + // This is useful when users want to expose metrics that are not part of the default Mimir metrics. + http.Handle(t.Cfg.CustomRegistryPath, promhttp.HandlerFor(customRegistry, promhttp.HandlerOpts{Registry: customRegistry})) + err := customRegistry.Register(t.CostAttributionManager) + return t.CostAttributionManager, err + } + return nil, nil +} + func (t *Mimir) tsdbIngesterConfig() { t.Cfg.Ingester.BlocksStorageConfig = t.Cfg.BlocksStorage } @@ -655,7 +674,7 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.Cfg.Ingester.IngestStorageConfig = t.Cfg.IngestStorage t.tsdbIngesterConfig() - t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.Registerer, util_log.Logger) + t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.CostAttributionManager, t.Registerer, util_log.Logger) if err != nil { return } @@ -1136,6 +1155,7 @@ func (t *Mimir) setupModuleManager() error { mm.RegisterModule(Overrides, t.initOverrides, modules.UserInvisibleModule) mm.RegisterModule(OverridesExporter, t.initOverridesExporter) mm.RegisterModule(ActiveGroupsCleanupService, t.initActiveGroupsCleanupService, modules.UserInvisibleModule) + mm.RegisterModule(CostAttributionService, t.initCostAttributionService, modules.UserInvisibleModule) mm.RegisterModule(Distributor, t.initDistributor) mm.RegisterModule(DistributorService, t.initDistributorService, modules.UserInvisibleModule) mm.RegisterModule(Ingester, t.initIngester) @@ -1175,9 +1195,9 @@ func (t *Mimir) setupModuleManager() error { IngesterPartitionRing: {MemberlistKV, IngesterRing, API}, Overrides: {RuntimeConfig}, OverridesExporter: {Overrides, MemberlistKV, Vault}, - Distributor: {DistributorService, API, ActiveGroupsCleanupService, Vault}, + Distributor: {DistributorService, API, ActiveGroupsCleanupService, CostAttributionService, Vault}, DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault}, - Ingester: {IngesterService, API, ActiveGroupsCleanupService, Vault}, + Ingester: {IngesterService, API, ActiveGroupsCleanupService, CostAttributionService, Vault}, IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV}, Flusher: {Overrides, API}, Queryable: {Overrides, DistributorService, IngesterRing, IngesterPartitionRing, API, StoreQueryable, MemberlistKV}, diff --git a/pkg/streamingpromql/benchmarks/ingester.go
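Because initCostAttributionService registers the promhttp handler on the default serve mux, a minimal, self-contained sketch of what exposure on the custom path looks like could be the following; the path value is illustrative only.

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Mirror what initCostAttributionService does: a dedicated registry served on a custom path.
	customRegistry := prometheus.NewRegistry()
	http.Handle("/cost-attribution-metrics", promhttp.HandlerFor(customRegistry, promhttp.HandlerOpts{Registry: customRegistry}))

	// Hit the path through the default mux to confirm the registry is reachable.
	rec := httptest.NewRecorder()
	http.DefaultServeMux.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/cost-attribution-metrics", nil))
	fmt.Println(rec.Code) // 200 once the handler is registered
}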
b/pkg/streamingpromql/benchmarks/ingester.go index 6f3b5f04a9a..9107b66f64f 100644 --- a/pkg/streamingpromql/benchmarks/ingester.go +++ b/pkg/streamingpromql/benchmarks/ingester.go @@ -96,7 +96,7 @@ func startBenchmarkIngester(rootDataDir string) (*ingester.Ingester, string, fun return services.StopAndAwaitTerminated(context.Background(), ingestersRing) }) - ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, log.NewNopLogger()) + ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, nil, log.NewNopLogger()) if err != nil { cleanup() return nil, "", nil, fmt.Errorf("could not create ingester: %w", err) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index b49a7eebd25..de374f2b786 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -183,6 +183,10 @@ type Limits struct { LabelValuesMaxCardinalityLabelNamesPerRequest int `yaml:"label_values_max_cardinality_label_names_per_request" json:"label_values_max_cardinality_label_names_per_request"` ActiveSeriesResultsMaxSizeBytes int `yaml:"active_series_results_max_size_bytes" json:"active_series_results_max_size_bytes" category:"experimental"` + // Cost attribution and limit. + CostAttributionLabels flagext.StringSliceCSV `yaml:"cost_attribution_labels" json:"cost_attribution_labels" category:"experimental"` + MaxCostAttributionPerUser int `yaml:"max_cost_attribution_per_user" json:"max_cost_attribution_per_user" category:"experimental"` + // Ruler defaults and limits. RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` @@ -289,7 +293,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&l.OutOfOrderBlocksExternalLabelEnabled, "ingester.out-of-order-blocks-external-label-enabled", false, "Whether the shipper should label out-of-order blocks with an external label before uploading them. Setting this label will compact out-of-order blocks separately from non-out-of-order blocks") f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") - + f.Var(&l.CostAttributionLabels, "validation.cost-attribution-labels", "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.") + f.IntVar(&l.MaxCostAttributionPerUser, "validation.max-cost-attribution-per-user", 0, "Maximum number of cost attribution labels allowed per user.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 
0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable") @@ -427,7 +432,6 @@ func (l *Limits) unmarshal(decode func(any) error) error { return err } l.extensions = getExtensions() - return l.validate() } @@ -779,6 +783,14 @@ func (o *Overrides) SeparateMetricsGroupLabel(userID string) string { return o.getOverridesForUser(userID).SeparateMetricsGroupLabel } +func (o *Overrides) CostAttributionLabel(userID string) []string { + return o.getOverridesForUser(userID).CostAttributionLabels +} + +func (o *Overrides) MaxCostAttributionPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionPerUser +} + // IngestionTenantShardSize returns the ingesters shard size for a given user. func (o *Overrides) IngestionTenantShardSize(userID string) int { return o.getOverridesForUser(userID).IngestionTenantShardSize From 0456d94fcd5396a9fbc3260237487aff084e267e Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 24 Oct 2024 15:40:00 +0200 Subject: [PATCH 02/32] test update --- pkg/costattribution/manager_test.go | 274 ++++++++++++++++++ pkg/costattribution/tracker_test.go | 50 ++++ pkg/distributor/distributor_test.go | 6 +- pkg/distributor/validate_test.go | 12 +- .../activeseries/active_labels_test.go | 6 +- .../active_native_histogram_postings_test.go | 32 +- .../activeseries/active_postings_test.go | 21 +- .../activeseries/active_series_test.go | 207 +++++++------ .../ingester_early_compaction_test.go | 2 +- pkg/ingester/ingester_ingest_storage_test.go | 2 +- pkg/ingester/ingester_test.go | 6 +- .../benchmarks/comparison_test.go | 2 +- 12 files changed, 475 insertions(+), 145 deletions(-) create mode 100644 pkg/costattribution/manager_test.go create mode 100644 pkg/costattribution/tracker_test.go diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go new file mode 100644 index 00000000000..8bdf56b5bc5 --- /dev/null +++ b/pkg/costattribution/manager_test.go @@ -0,0 +1,274 @@ +package costattribution + +// func newTestManager() *Manager { +// logger := log.NewNopLogger() +// limits, _ := validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(map[string]*validation.Limits{ +// "user1": { +// MaxCostAttributionPerUser: 5, +// CostAttributionLabel: "team", +// }, +// "user2": { +// MaxCostAttributionPerUser: 2, +// CostAttributionLabel: "", +// }, +// "user3": { +// MaxCostAttributionPerUser: 2, +// CostAttributionLabel: "department", +// }, +// })) +// inactiveTimeout := 2 * time.Minute +// cooldownTimeout := 1 * time.Minute +// cleanupInterval := 1 * time.Minute +// return NewManager(cleanupInterval, inactiveTimeout, cooldownTimeout, logger, limits) +// } + +// func Test_NewManager(t *testing.T) { +// manager := newTestManager() +// assert.NotNil(t, manager, "Expected manager to be initialized") +// assert.NotNil(t, manager.attributionTracker, "Expected attribution tracker to be initialized") +// assert.Equal(t, "__overflow__", manager.invalidValue, "Expected invalidValue to be 
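A minimal sketch of how the new per-tenant limits and Overrides getters above could be exercised, reusing validation.NewMockTenantLimits as the commented-out manager tests in this patch do; the tenant ID and values are illustrative only.

// Inside a test: build per-tenant overrides carrying the new cost attribution limits.
overrides, err := validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(map[string]*validation.Limits{
	"tenant-a": {
		CostAttributionLabels:     []string{"team"}, // hypothetical attribution label
		MaxCostAttributionPerUser: 100,              // hypothetical cardinality limit
	},
}))
if err == nil {
	_ = overrides.CostAttributionLabel("tenant-a")      // returns []string{"team"}
	_ = overrides.MaxCostAttributionPerUser("tenant-a") // returns 100
}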
initialized") +// } + +// func Test_EnabledForUser(t *testing.T) { +// manager := newTestManager() +// assert.True(t, manager.EnabledForUser("user1"), "Expected cost attribution to be enabled for user1") +// assert.False(t, manager.EnabledForUser("user2"), "Expected cost attribution to be disabled for user2") +// assert.False(t, manager.EnabledForUser("user4"), "Expected cost attribution to be disabled for user4") +// } + +// func Test_UserAttributionLabel(t *testing.T) { +// manager := newTestManager() +// assert.Equal(t, "team", manager.UserAttributionLabel("user1")) +// assert.Equal(t, "", manager.UserAttributionLabel("user2")) +// assert.Equal(t, "department", manager.UserAttributionLabel("user3")) +// assert.Equal(t, 2, len(manager.attributionTracker.trackersByUserID)) +// assert.Equal(t, "team", manager.attributionTracker.trackersByUserID["user1"].trackedLabel) +// assert.Equal(t, "department", manager.attributionTracker.trackersByUserID["user3"].trackedLabel) +// } + +// func Test_UserAttributionLimit(t *testing.T) { +// manager := newTestManager() +// assert.Equal(t, 5, manager.UserAttributionLimit("user1")) +// assert.Equal(t, 0, manager.UserAttributionLimit("user2")) +// assert.Equal(t, 0, manager.UserAttributionLimit("user4")) +// } + +// func Test_UpdateAttributionTimestamp(t *testing.T) { +// manager := newTestManager() + +// lbls := labels.NewBuilder(labels.EmptyLabels()) +// tm1, tm2, tm3 := "bar", "foo", "baz" +// t.Run("Should update the timestamp when limit not reached for the user attribution", func(t *testing.T) { +// lbls.Set("department", tm1) +// isOutdated, result := manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(0, 0)) +// assert.False(t, isOutdated, "Expected label to be the same as the one in the cache") +// assert.Equal(t, tm1, result, "Expected attribution to be returned since user is enabled for cost attribution, and limit is not reached") +// assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].observed[tm1]) +// assert.Equal(t, int64(0), manager.attributionTracker.trackersByUserID["user3"].observed[tm1].Load()) + +// lbls.Set("department", tm2) +// isOutdated, result = manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(1, 0)) +// assert.False(t, isOutdated) +// assert.Equal(t, tm2, result, "Expected attribution to be returned since user is enabled for cost attribution, and limit is not reached") +// assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].observed[tm2]) +// assert.Equal(t, int64(1), manager.attributionTracker.trackersByUserID["user3"].observed[tm2].Load()) +// }) + +// t.Run("Should only update the timestamp of invalide when limit reached for the user attribution", func(t *testing.T) { +// lbls.Set("department", tm3) +// isOutdated, result := manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(2, 0)) +// assert.False(t, isOutdated) +// assert.Equal(t, manager.invalidValue, result, "Expected invalidValue to be returned since user has reached the limit of cost attribution labels") +// assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].observed[manager.invalidValue]) +// assert.Equal(t, int64(2), manager.attributionTracker.trackersByUserID["user3"].observed[manager.invalidValue].Load()) + +// lbls.Set("department", tm1) +// isOutdated, result = manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(3, 0)) +// assert.False(t, isOutdated) +// assert.Equal(t, 
manager.invalidValue, result, "Expected invalidValue to be returned since user has reached the limit of cost attribution labels") +// assert.Equal(t, int64(3), manager.attributionTracker.trackersByUserID["user3"].observed[manager.invalidValue].Load()) +// }) +// } + +// func Test_SetActiveSeries(t *testing.T) { +// manager := newTestManager() +// reg := prometheus.NewRegistry() +// err := reg.Register(manager) +// require.NoError(t, err) +// userID := "user1" + +// lbls := labels.NewBuilder(labels.EmptyLabels()) + +// t.Run("Should set the active series gauge for the given user and attribution", func(t *testing.T) { +// lbls.Set("team", "foo") +// isOutdated, val := manager.UpdateAttributionTimestamp(userID, "team", lbls.Labels(), time.Unix(0, 0)) +// assert.False(t, isOutdated) +// manager.SetActiveSeries(userID, "team", val, 1.0) +// expectedMetrics := ` +// # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. +// # TYPE cortex_ingester_active_series_attribution gauge +// cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 +// ` +// metricNames := []string{ +// "cortex_ingester_active_series_attribution", +// } +// assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) +// }) + +// t.Run("Should set the active series gauge for all users and attributions enabled and ignore disabled user", func(t *testing.T) { +// userID = "user3" +// lbls.Set("department", "bar") +// isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(0, 0)) +// assert.False(t, isOutdated) +// manager.SetActiveSeries(userID, "department", val, 2.0) + +// lbls.Set("department", "baz") +// isOutdated, val = manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(1, 0)) +// assert.False(t, isOutdated) +// manager.SetActiveSeries(userID, "department", val, 3.0) + +// expectedMetrics := ` +// # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. +// # TYPE cortex_ingester_active_series_attribution gauge +// cortex_ingester_active_series_attribution{department="bar",user="user3"} 2 +// cortex_ingester_active_series_attribution{department="baz",user="user3"} 3 +// cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 +// ` +// metricNames := []string{ +// "cortex_ingester_active_series_attribution", +// } +// assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) +// }) + +// t.Run("Cleanup the active series gauge for the given user and attribution when cost attribution disabled", func(t *testing.T) { +// limits := manager.attributionTracker.limits +// defer func() { manager.attributionTracker.limits = limits }() +// userID = "user3" +// lbls.Set("department", "baz") + +// overrides, _ := validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(map[string]*validation.Limits{ +// userID: { +// MaxCostAttributionPerUser: 2, +// CostAttributionLabel: "", +// }, +// })) +// manager.attributionTracker.limits = overrides +// isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(5, 0)) +// assert.False(t, isOutdated) +// manager.SetActiveSeries(userID, val, "department", 3.0) + +// expectedMetrics := ` +// # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. 
+// # TYPE cortex_ingester_active_series_attribution gauge +// cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 +// ` +// metricNames := []string{ +// "cortex_ingester_active_series_attribution", +// } +// assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) +// }) + +// t.Run("Should ignore setting the active series gauge for disabled user", func(t *testing.T) { +// userID = "user2" +// lbls.Set("department", "bar") +// isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(0, 0)) +// assert.False(t, isOutdated) +// manager.SetActiveSeries(userID, val, "department", 4.0) + +// expectedMetrics := ` +// # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. +// # TYPE cortex_ingester_active_series_attribution gauge +// cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 +// ` +// metricNames := []string{ +// "cortex_ingester_active_series_attribution", +// } +// assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) +// }) +// } + +// func TestUpdateAttributionTimestampForUser(t *testing.T) { +// cooldownTimeout := 10 * time.Second +// t.Run("Should not update the timestamp for the user if attribution lable is not set", func(t *testing.T) { +// // Create mock limits +// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "", MaxCostAttributionPerUser: 5}, nil) +// assert.NoError(t, err) +// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) +// assert.NotNil(t, trackerGroup) + +// ts := time.Unix(1, 0) +// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "platformA", ts) +// trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "teamB", ts) + +// assert.Equal(t, 0, len(trackerGroup.trackersByUserID)) +// }) + +// t.Run("Should not update the timestamp for the user if max cost attribution per user is 0", func(t *testing.T) { +// // Create mock limits +// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 0}, nil) +// assert.NoError(t, err) + +// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) +// assert.NotNil(t, trackerGroup) + +// ts := time.Unix(1, 0) +// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "platformA", ts) +// trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "teamB", ts) + +// assert.Equal(t, 0, len(trackerGroup.trackersByUserID)) +// }) + +// t.Run("Should update the timestamp for the user attribution", func(t *testing.T) { +// // Create mock limits +// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil) +// assert.NoError(t, err) + +// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) +// assert.NotNil(t, trackerGroup) + +// ts := time.Unix(1, 0) +// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", ts) +// trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "barA", ts) + +// assert.Equal(t, 2, len(trackerGroup.trackersByUserID)) +// fmt.Println(trackerGroup.trackersByUserID) +// assert.NotNil(t, trackerGroup.trackersByUserID["tenantA"]) +// assert.NotNil(t, trackerGroup.trackersByUserID["tenantA"].observed["fooA"]) +// assert.Equal(t, int64(1), trackerGroup.trackersByUserID["tenantA"].observed["fooA"].Load()) + +// 
trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "barA", ts.Add(time.Second)) +// assert.Equal(t, int64(2), trackerGroup.trackersByUserID["tenantB"].observed["barA"].Load()) +// }) +// } + +// func TestUserAttributionLabel(t *testing.T) { +// cooldownTimeout := 10 * time.Second +// t.Run("Should return the cost attribution label for the user", func(t *testing.T) { +// // Create mock limits +// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil) +// assert.NoError(t, err) + +// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) +// assert.NotNil(t, trackerGroup) +// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", time.Unix(0, 0)) + +// assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) +// }) + +// t.Run("Should return the default cost attribution label for the user if it is in cache", func(t *testing.T) { +// // Create mock limits +// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil) +// assert.NoError(t, err) + +// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) +// assert.NotNil(t, trackerGroup) + +// assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) + +// // update the timestamp for the user, so cache is updated +// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", time.Unix(0, 0)) + +// // still read the cost attribution label from cache until cache is updated by timed service +// assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) +// }) +// } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go new file mode 100644 index 00000000000..38063880119 --- /dev/null +++ b/pkg/costattribution/tracker_test.go @@ -0,0 +1,50 @@ +package costattribution + +import ( + "strings" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func Test_NewTracker(t *testing.T) { + reg := prometheus.NewRegistry() + + // Initialize a new Tracker + trackedLabel := []string{"platform"} + cat, err := newTracker(trackedLabel, 5) + require.NoError(t, err) + err = reg.Register(cat) + require.NoError(t, err) + + // Simulate some values in the metrics + vals := []string{"foo", "user1"} + cat.activeSeriesPerUserAttribution.WithLabelValues(vals...).Set(1.0) + cat.receivedSamplesAttribution.WithLabelValues(vals...).Add(5) + cat.discardedSampleAttribution.WithLabelValues(vals...).Add(2) + + expectedMetrics := ` + # HELP cortex_discarded_samples_attribution_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_samples_attribution_total counter + cortex_discarded_samples_attribution_total{platform="foo",user="user1"} 2 + # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. + # TYPE cortex_ingester_active_series_attribution gauge + cortex_ingester_active_series_attribution{platform="foo",user="user1"} 1 + # HELP cortex_received_samples_attribution_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_samples_attribution_total counter + cortex_received_samples_attribution_total{platform="foo",user="user1"} 5 + ` + + metricNames := []string{ + "cortex_discarded_samples_attribution_total", + "cortex_received_samples_attribution_total", + "cortex_ingester_active_series_attribution", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + + // Clean the tracker for the user attribution + cat.cleanupTrackerAttribution(vals) +} diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index e6a26355d17..7efcf14b25b 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -2035,7 +2035,7 @@ func BenchmarkDistributor_Push(b *testing.B) { require.NoError(b, err) // Start the distributor. - distributor, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(b, err) require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) @@ -5323,7 +5323,7 @@ func prepare(t testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []* require.NoError(t, err) reg := prometheus.NewPedanticRegistry() - d, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) + d, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(ctx, d)) t.Cleanup(func() { @@ -7957,7 +7957,7 @@ func TestCheckStartedMiddleware(t *testing.T) { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - distributor, err := New(distributorConfig, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorConfig, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(t, err) ctx := user.InjectOrgID(context.Background(), "user") diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index ffb91b9d626..71c6e92415f 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -55,6 +55,7 @@ func (vm validateMetadataCfg) MaxMetadataLength(_ string) int { } func TestValidateLabels(t *testing.T) { + ts := time.Now() reg := prometheus.NewPedanticRegistry() s := newSampleValidationMetrics(reg) @@ -197,7 +198,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, nil, ts) assert.Equal(t, c.err, err, "wrong error") } @@ -390,6 +391,7 @@ func TestValidateMetadata(t *testing.T) { } func TestValidateLabelDuplication(t *testing.T) { + ts := time.Now() var cfg validateLabelsCfg cfg.maxLabelNameLength = 10 cfg.maxLabelNamesPerSeries = 10 @@ -400,7 +402,7 @@ func TestValidateLabelDuplication(t *testing.T) { actual := validateLabels(newSampleValidationMetrics(nil), cfg, userID, "", []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "a"}, {Name: model.MetricNameLabel, Value: "b"}, - }, false, 
false) + }, false, false, nil, ts) expected := fmt.Errorf( duplicateLabelMsgFormat, model.MetricNameLabel, @@ -417,7 +419,7 @@ func TestValidateLabelDuplication(t *testing.T) { {Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}, {Name: "a", Value: "a"}, - }, false, false) + }, false, false, nil, ts) expected = fmt.Errorf( duplicateLabelMsgFormat, "a", @@ -576,7 +578,7 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { cfg.maxNativeHistogramBuckets = limit ls := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, nil) if limit == 1 { require.Error(t, err) @@ -623,7 +625,7 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { hist.Schema = testCase.schema - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, nil) require.Equal(t, testCase.expectedError, err) }) } diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index aa7f928d7dd..6fdf3e00bc4 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,7 +41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) memPostings := index.NewMemPostings() for i, l := range series { @@ -51,10 +51,10 @@ func TestIsLabelValueActive(t *testing.T) { // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) require.True(t, valid) result, err := IsLabelValueActive(ctx, reader, activeSeries, "a", "1") diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 665f5787c61..2b95020c68d 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,7 +26,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -34,10 +34,10 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { if i+1 == 3 || i+1 == 4 { buckets = 10 // Native histogram with 10 buckets. 
} - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -62,7 +62,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -70,10 +70,10 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { if i == 2 || i == 3 { buckets = i * 10 // Native histogram with i*10 buckets. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 5, allActive) @@ -106,17 +106,18 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 if i+1 == 4 { buckets = -1 // Make ref==4 not a native histogram to check that Seek skips it. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -145,14 +146,15 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. 
for i := range allStorageRefs { buckets := i * 10 - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -181,14 +183,14 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index a2345841d11..84c71634e72 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,13 +26,14 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -57,13 +58,14 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -88,13 +90,14 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index cf821c5bca5..0e20ecb6f78 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -38,9 +38,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - - valid := c.Purge(time.Now()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -50,8 +49,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveBuckets) assert.Empty(t, activeMatchingBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -62,8 +61,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -74,8 +73,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -86,8 +85,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) 
assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -98,8 +97,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 5, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -111,8 +110,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 8, allActiveBuckets) // more buckets for a histogram - c.UpdateSeries(ls3, ref3, time.Now(), 7) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 7, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -124,8 +123,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 10, allActiveBuckets) // changing a metric from histogram to float - c.UpdateSeries(ls4, ref4, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -150,7 +149,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -162,7 +161,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // ref5 is created with the same labelset as ls1, it shouldn't be accounted as different series. - c.UpdateSeries(ls1, ref5, time.Now(), -1) + c.UpdateSeries(ls1, ref5, time.Now(), -1, nil) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) assert.Equal(t, 1, allActiveHistograms) @@ -173,7 +172,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -204,19 +203,19 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // The expected number of series is the total number of series minus the ttl // because the first ttl series should be purged exp := len(series) - (ttl) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -231,7 +230,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) } @@ -243,7 +242,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { ref5, ls5 := storage.SeriesRef(5), labels.FromStrings("a", "5") ref6 := storage.SeriesRef(6) // same as ls2 - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -257,8 +256,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -272,8 +271,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -287,8 +286,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -302,8 +301,8 @@ func testUpdateSeries(t *testing.T, 
c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -317,8 +316,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -332,8 +331,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 3, allActiveBuckets) - c.UpdateSeries(ls5, ref5, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls5, ref5, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -348,8 +347,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 8, allActiveBuckets) // changing a metric from float to histogram - c.UpdateSeries(ls3, ref3, time.Now(), 6) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 6, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -364,8 +363,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 14, allActiveBuckets) // fewer (zero) buckets for a histogram - c.UpdateSeries(ls4, ref4, time.Now(), 0) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 0, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -397,7 +396,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -412,7 +411,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // ls2 is pushed again, this time with ref6 - c.UpdateSeries(ls2, ref6, time.Now(), -1) + c.UpdateSeries(ls2, ref6, time.Now(), -1, nil) // Numbers don't change. allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -427,7 +426,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -448,7 +447,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) c.Clear() @@ -489,11 +488,11 @@ func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - c.UpdateSeries(ls2, ref2, time.Now(), -1) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -517,22 +516,22 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } c.PostDeletion(map[chunks.HeadSeriesRef]labels.Labels{ deletedRef: deletedLabels, }) - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) exp := len(series) - (ttl) // Purge is not intended to purge - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -563,13 +562,13 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute) + c := NewActiveSeries(asm, 5*time.Minute, nil) exp := len(series) - ttl expMatchingSeries := 0 for i, s := range series { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) // if this series is matching, and they're within the ttl tmp := asm.Matches(s) @@ -578,11 +577,11 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { } } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. 
- c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -596,28 +595,28 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, nil) - c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 1, allActive) - c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) // This will *not* update the series, since there is already newer timestamp. - c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1) + c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -632,30 +631,30 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) assert.Equal(t, []int{1}, activeMatching) c.ReloadMatchers(asm, currentTime) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.False(t, valid) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls1, ref1, currentTime, -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -666,8 +665,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. 
currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls3, ref3, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls3, ref3, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -681,8 +680,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls4, ref4, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls4, ref4, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -698,15 +697,15 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -717,10 +716,10 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -737,15 +736,15 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -757,11 +756,11 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -790,7 +789,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. 
- c = NewActiveSeries(&asmodel.Matchers{}, 0) + c = NewActiveSeries(&asmodel.Matchers{}, 0, nil) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -824,7 +823,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo nextSeriesID = 0 } - c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1) + c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1, nil) } }(i) } @@ -841,7 +840,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo case <-stopPurge: return default: - c.Purge(future()) + c.Purge(future(), nil) } // Throttle, but keep high pressure from Purge(). @@ -928,10 +927,10 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { - c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) + c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1, nil) now++ } } @@ -953,7 +952,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} @@ -968,13 +967,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Prepare series for ix, s := range series { if ix < numExpiresSeries { - c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1) + c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1, nil) } else { - c.UpdateSeries(s, refs[ix], currentTime, -1) + c.UpdateSeries(s, refs[ix], currentTime, -1, nil) } } - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(b, numSeries, allActive) @@ -982,13 +981,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Purge is going to purge everything currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) if twice { - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) diff --git a/pkg/ingester/ingester_early_compaction_test.go b/pkg/ingester/ingester_early_compaction_test.go index 5cf29b7de99..a63d397836f 100644 --- a/pkg/ingester/ingester_early_compaction_test.go +++ b/pkg/ingester/ingester_early_compaction_test.go @@ -130,7 +130,7 @@ func TestIngester_compactBlocksToReduceInMemorySeries_ShouldTriggerCompactionOnl require.Len(t, listBlocksInDir(t, userBlocksDir), 0) // Use a trick to track all series we've written so far as "inactive". - ingester.getTSDB(userID).activeSeries.Purge(now.Add(30 * time.Minute)) + ingester.getTSDB(userID).activeSeries.Purge(now.Add(30*time.Minute), nil) // Pre-condition check. 
require.Equal(t, uint64(10), ingester.getTSDB(userID).Head().NumSeries()) diff --git a/pkg/ingester/ingester_ingest_storage_test.go b/pkg/ingester/ingester_ingest_storage_test.go index 4a529321155..fcf79dd4bc7 100644 --- a/pkg/ingester/ingester_ingest_storage_test.go +++ b/pkg/ingester/ingester_ingest_storage_test.go @@ -650,7 +650,7 @@ func createTestIngesterWithIngestStorage(t testing.TB, ingesterCfg *Config, over require.NoError(t, services.StopAndAwaitTerminated(ctx, prw)) }) - ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, reg, util_test.NewTestingLogger(t)) + ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, nil, reg, util_test.NewTestingLogger(t)) require.NoError(t, err) return ingester, kafkaCluster, prw diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 5ecadbcef15..2db6f7968d9 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -6068,7 +6068,7 @@ func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, i ingestersRing = createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()) } - ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) + ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) if err != nil { return nil, err } @@ -6274,7 +6274,7 @@ func TestIngester_OpenExistingTSDBOnStartup(t *testing.T) { // setup the tsdbs dir testData.setup(t, tempDir) - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) startErr := services.StartAndAwaitRunning(context.Background(), ingester) @@ -7434,7 +7434,7 @@ func TestHeadCompactionOnStartup(t *testing.T) { ingesterCfg.BlocksStorageConfig.Bucket.S3.Endpoint = "localhost" ingesterCfg.BlocksStorageConfig.TSDB.Retention = 2 * 24 * time.Hour // Make sure that no newly created blocks are deleted. 
- ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), ingester)) diff --git a/pkg/streamingpromql/benchmarks/comparison_test.go b/pkg/streamingpromql/benchmarks/comparison_test.go index 93407d75fb3..2615846dd98 100644 --- a/pkg/streamingpromql/benchmarks/comparison_test.go +++ b/pkg/streamingpromql/benchmarks/comparison_test.go @@ -237,7 +237,7 @@ func createIngesterQueryable(t testing.TB, address string) storage.Queryable { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, ingestersRing, nil, false, nil, logger) + d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, nil, ingestersRing, nil, false, nil, logger) require.NoError(t, err) queryMetrics := stats.NewQueryMetrics(nil) From f4ebc07871ef79a0ed7ee31c48032a9fd42fe757 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 25 Oct 2024 16:54:32 +0200 Subject: [PATCH 03/32] address review comments and fix lint and test Signed-off-by: Ying WANG --- cmd/mimir/config-descriptor.json | 21 +-- cmd/mimir/help-all.txt.tmpl | 10 +- pkg/costattribution/manager.go | 63 ++++---- pkg/costattribution/tracker.go | 138 +++++++++++------- pkg/costattribution/tracker_test.go | 26 ++-- pkg/distributor/distributor.go | 8 +- pkg/distributor/validate.go | 66 +++------ pkg/distributor/validate_test.go | 11 +- .../activeseries/active_labels_test.go | 3 +- .../active_native_histogram_postings_test.go | 11 +- .../activeseries/active_postings_test.go | 7 +- pkg/ingester/activeseries/active_series.go | 99 +++++++------ .../activeseries/active_series_test.go | 29 ++-- pkg/ingester/ingester.go | 74 +++------- pkg/ingester/user_tsdb.go | 4 +- pkg/mimir/mimir.go | 4 +- pkg/mimir/modules.go | 2 + pkg/util/validation/limits.go | 13 +- 18 files changed, 286 insertions(+), 303 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index e05a43e7932..edc308348fd 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4360,12 +4360,12 @@ }, { "kind": "field", - "name": "max_cost_attribution_per_user", + "name": "max_cost_attribution_cardinality_per_user", "required": false, - "desc": "Maximum number of cost attribution labels allowed per user.", + "desc": "Maximum cardinality of cost attribution labels allowed per user.", "fieldValue": null, "fieldDefaultValue": 0, - "fieldFlag": "validation.max-cost-attribution-per-user", + "fieldFlag": "validation.max-cost-attribution-cardinality-per-user", "fieldType": "int", "fieldCategory": "experimental" }, @@ -18394,21 +18394,10 @@ "kind": "field", "name": "cost_attribution_eviction_interval", "required": false, - "desc": "Time interval at which inactive cost attributions will be evicted from the cache.", + "desc": "Time interval at which inactive cost attributions will be evicted from the counter, so it won't be counted when checking max_cost_attribution_cardinality_per_user.", "fieldValue": null, "fieldDefaultValue": 1800000000000, - "fieldFlag": "cost-attribution-eviction-interval", - "fieldType": "duration", - "fieldCategory": "experimental" - }, - { - "kind": "field", - "name": "cost_attribution_cool_down_duration", 
- "required": false, - "desc": "Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache.", - "fieldValue": null, - "fieldDefaultValue": 1200000000000, - "fieldFlag": "cost-attribution-cool-down-duration", + "fieldFlag": "cost-attribution.eviction-interval", "fieldType": "duration", "fieldCategory": "experimental" } diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 9977eed5e61..a5bac0676c5 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1139,10 +1139,8 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. - -cost-attribution-cool-down-duration duration - [experimental] Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache. (default 20m0s) - -cost-attribution-eviction-interval duration - [experimental] Time interval at which inactive cost attributions will be evicted from the cache. (default 30m0s) + -cost-attribution.eviction-interval duration + [experimental] Time interval at which inactive cost attributions will be evicted from the counter, so it won't be counted when checking max_cost_attribution_cardinality_per_user. (default 30m0s) -custom-registry-path string Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. -debug.block-profile-rate int @@ -3109,8 +3107,8 @@ Usage of ./cmd/mimir/mimir: Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) - -validation.max-cost-attribution-per-user int - [experimental] Maximum number of cost attribution labels allowed per user. + -validation.max-cost-attribution-cardinality-per-user int + [experimental] Maximum cardinality of cost attribution labels allowed per user. -validation.max-label-names-per-series int Maximum number of label names per series. (default 30) -validation.max-length-label-name int diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index cda08a6eaa3..c1e3f1d7a94 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -23,21 +23,19 @@ type Manager struct { logger log.Logger inactiveTimeout time.Duration limits *validation.Overrides - cooldownTimeout time.Duration // mu protects the trackersByUserID map - tlock sync.RWMutex - trackersByUserID map[string]*Tracker + mtx sync.RWMutex + trackersByUserID map[string]*TrackerImp } // NewManager creates a new cost attribution manager. which is responsible for managing the cost attribution of series. // It will clean up inactive series and update the cost attribution of series every 3 minutes. 
-func NewManager(cleanupInterval, inactiveTimeout time.Duration, cooldownTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager { +func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager { s := &Manager{ - trackersByUserID: make(map[string]*Tracker), + trackersByUserID: make(map[string]*TrackerImp), limits: limits, - tlock: sync.RWMutex{}, - cooldownTimeout: cooldownTimeout, + mtx: sync.RWMutex{}, inactiveTimeout: inactiveTimeout, logger: logger, } @@ -53,27 +51,27 @@ func (m *Manager) iteration(_ context.Context) error { // EnabledForUser returns true if the cost attribution is enabled for the user func (m *Manager) EnabledForUser(userID string) bool { - return len(m.limits.CostAttributionLabel(userID)) > 0 + return len(m.limits.CostAttributionLabels(userID)) > 0 } -func (m *Manager) TrackerForUser(userID string) *Tracker { +func (m *Manager) TrackerForUser(userID string) Tracker { // if cost attribution is not enabled, return nil if !m.EnabledForUser(userID) { - return nil + return NewNoopTracker() } - m.tlock.Lock() - defer m.tlock.Unlock() + m.mtx.Lock() + defer m.mtx.Unlock() // if not exists, create a new tracker if _, exists := m.trackersByUserID[userID]; !exists { - m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabel(userID), m.limits.MaxCostAttributionPerUser(userID)) + m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) } return m.trackersByUserID[userID] } func (m *Manager) Collect(out chan<- prometheus.Metric) { - m.tlock.RLock() - defer m.tlock.RUnlock() + m.mtx.RLock() + defer m.mtx.RUnlock() for _, tracker := range m.trackersByUserID { tracker.Collect(out) } @@ -86,8 +84,8 @@ func (m *Manager) Describe(chan<- *prometheus.Desc) { // deleteUserTracer is delete user tracker since the user is disabled for cost attribution func (m *Manager) deleteUserTracer(userID string) { - m.tlock.Lock() - defer m.tlock.Unlock() + m.mtx.Lock() + defer m.mtx.Unlock() if _, exists := m.trackersByUserID[userID]; !exists { return } @@ -99,24 +97,23 @@ func (m *Manager) deleteUserTracer(userID string) { func (m *Manager) purgeInactiveAttributions(inactiveTimeout time.Duration) { // Get all userIDs from the map - m.tlock.RLock() + m.mtx.RLock() userIDs := make([]string, 0, len(m.trackersByUserID)) for userID := range m.trackersByUserID { userIDs = append(userIDs, userID) } - m.tlock.RUnlock() + m.mtx.RUnlock() // Iterate over all userIDs and purge inactive attributions of each user currentTime := time.Now() for _, userID := range userIDs { // if cost attribution is not enabled for the user, delete the user tracker and continue - if len(m.limits.CostAttributionLabel(userID)) == 0 || m.limits.MaxCostAttributionPerUser(userID) <= 0 { + if len(m.limits.CostAttributionLabels(userID)) == 0 || m.limits.MaxCostAttributionCardinalityPerUser(userID) <= 0 { m.deleteUserTracer(userID) continue } // get all inactive attributions for the user and clean up the tracker inactiveObs := m.purgeInactiveObservationsForUser(userID, currentTime.Add(-inactiveTimeout).UnixNano()) - for _, ob := range inactiveObs { m.trackersByUserID[userID].cleanupTrackerAttribution(ob.lvalues) } @@ -136,23 +133,27 @@ func compareStringSlice(a, b []string) bool { return true } -func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64) []*observation { +func (m *Manager) purgeInactiveObservationsForUser(userID string, 
deadline int64) []*Observation { cat := m.TrackerForUser(userID) - if cat == nil { + if _, ok := cat.(*NoopTracker); ok { + // It's a noop implementation return nil } - newTrackedLabels := sort.StringSlice(m.limits.CostAttributionLabel(userID)) - // if they are different, we need to update the tracker, we don't mind, just reinitalized the tracker - if !compareStringSlice(cat.trackedLabels, newTrackedLabels) { - m.tlock.Lock() - m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabel(userID), m.limits.MaxCostAttributionPerUser(userID)) + newTrackedLabels := m.limits.CostAttributionLabels(userID) + sort.Slice(newTrackedLabels, func(i, j int) bool { + return newTrackedLabels[i] < newTrackedLabels[j] + }) + // if they are different, we need to update the tracker, we don't mind, just reinitialized the tracker + if !compareStringSlice(cat.GetCALabels(), newTrackedLabels) { + m.mtx.Lock() + m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) // update the tracker with the new tracker cat = m.trackersByUserID[userID] - m.tlock.Unlock() - } else if maxCardinality := m.limits.MaxCostAttributionPerUser(userID); cat.maxCardinality != maxCardinality { + m.mtx.Unlock() + } else if maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID); cat.GetMaxCardinality() != maxCardinality { // if the maxCardinality is different, update the tracker - cat.updateMaxCardinality(maxCardinality) + cat.UpdateMaxCardinality(maxCardinality) } return cat.PurgeInactiveObservations(deadline) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index d9f61cbda93..f888067426f 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -12,77 +12,100 @@ import ( "go.uber.org/atomic" ) -type observation struct { +type Tracker interface { + IncrementActiveSeries(labels.Labels, time.Time) + IncrementDiscardedSamples(labels.Labels, float64, string, time.Time) + IncrementReceivedSamples(labels.Labels, float64, time.Time) + DecrementActiveSeries(labels.Labels, time.Time) + PurgeInactiveObservations(int64) []*Observation + UpdateMaxCardinality(int) + GetMaxCardinality() int + GetCALabels() []string +} + +type Observation struct { lvalues []string lastUpdate *atomic.Int64 } -func (t *Tracker) cleanupTrackerAttribution(vals []string) { +func (t *TrackerImp) GetCALabels() []string { + return t.caLabels +} + +func (t *TrackerImp) GetMaxCardinality() int { + return t.maxCardinality +} + +func (t *TrackerImp) cleanupTrackerAttribution(vals []string) { t.activeSeriesPerUserAttribution.DeleteLabelValues(vals...) t.receivedSamplesAttribution.DeleteLabelValues(vals...) t.discardedSampleAttribution.DeleteLabelValues(vals...) 
} -func (t *Tracker) cleanupTracker(userID string) { +func (t *TrackerImp) cleanupTracker(userID string) { filter := prometheus.Labels{"user": userID} t.activeSeriesPerUserAttribution.DeletePartialMatch(filter) t.receivedSamplesAttribution.DeletePartialMatch(filter) t.discardedSampleAttribution.DeletePartialMatch(filter) } -type Tracker struct { +type TrackerImp struct { userID string - trackedLabels []string + caLabels []string maxCardinality int activeSeriesPerUserAttribution *prometheus.GaugeVec receivedSamplesAttribution *prometheus.CounterVec discardedSampleAttribution *prometheus.CounterVec - // oLock protects the observed map - oLock sync.RWMutex - observed map[uint64]*observation + // obseveredMtx protects the observed map + obseveredMtx sync.RWMutex + observed map[uint64]*Observation hashBuffer []byte } -func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { - vals := t.getKeyValues(lbs, now.Unix()) +func (t *TrackerImp) IncrementActiveSeries(lbs labels.Labels, now time.Time) { + vals := t.getKeyValues(lbs, now.Unix(), nil) t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() } -func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { - vals := t.getKeyValues(lbs, now.Unix()) +func (t *TrackerImp) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { + vals := t.getKeyValues(lbs, now.Unix(), &reason) t.discardedSampleAttribution.WithLabelValues(vals...).Add(value) } -func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { - vals := t.getKeyValues(lbs, now.Unix()) +func (t *TrackerImp) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { + vals := t.getKeyValues(lbs, now.Unix(), nil) t.receivedSamplesAttribution.WithLabelValues(vals...).Add(value) } -func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64) []string { - values := make([]string, len(t.trackedLabels)+1) - for i, l := range t.trackedLabels { +func (t *TrackerImp) getKeyValues(lbls labels.Labels, ts int64, reason *string) []string { + values := make([]string, len(t.caLabels)+2) + for i, l := range t.caLabels { values[i] = lbls.Get(l) if values[i] == "" { values[i] = missingValue } } - values[len(values)-1] = t.userID - + values[len(values)-2] = t.userID + if reason != nil { + values[len(values)-1] = *reason + } var stream uint64 - stream, t.hashBuffer = lbls.HashForLabels(t.hashBuffer, t.trackedLabels...) + stream, t.hashBuffer = lbls.HashForLabels(t.hashBuffer, t.caLabels...) if t.overflow(stream, values, ts) { // Omit last label. - for i := range values[:len(values)-1] { + for i := range values[:len(values)-2] { values[i] = overflowValue } } - + if reason == nil { + return values[:len(values)-1] + } return values } -func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { +func (t *TrackerImp) overflow(stream uint64, values []string, ts int64) bool { // If the maximum cardinality is hit all streams become `__overflow__`. 
if len(t.observed) > t.maxCardinality { return true @@ -91,7 +114,7 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { if o, known := t.observed[stream]; known && o.lastUpdate != nil && o.lastUpdate.Load() < ts { o.lastUpdate.Store(ts) } else { - t.observed[stream] = &observation{ + t.observed[stream] = &Observation{ lvalues: values, lastUpdate: atomic.NewInt64(ts), } @@ -102,50 +125,51 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { // we need the time stamp, since active series could have entered active stripe long time ago, and already evicted // from the observed map but still in the active Stripe -func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, value int64, ts time.Time) { - vals := t.getKeyValues(lbs, ts.Unix()) +func (t *TrackerImp) DecrementActiveSeries(lbs labels.Labels, ts time.Time) { + vals := t.getKeyValues(lbs, ts.Unix(), nil) t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() } -func newTracker(trackedLabels []string, limit int) (*Tracker, error) { +func newTracker(trackedLabels []string, limit int) (*TrackerImp, error) { // keep tracked labels sorted for consistent metric labels - sort.Strings(trackedLabels) - m := &Tracker{ - trackedLabels: trackedLabels, + sort.Slice(trackedLabels, func(i, j int) bool { + return trackedLabels[i] < trackedLabels[j] + }) + m := &TrackerImp{ + caLabels: trackedLabels, maxCardinality: limit, - oLock: sync.RWMutex{}, - observed: map[uint64]*observation{}, - //nolint:faillint // the metrics are registered in the mimir package + obseveredMtx: sync.RWMutex{}, + observed: map[uint64]*Observation{}, + //lint:ignore faillint the metrics are registered in the mimir package discardedSampleAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_discarded_samples_attribution_total", + Name: "cortex_discarded_attributed_samples_total", Help: "The total number of samples that were discarded per attribution.", - }, append(trackedLabels, "user")), - //nolint:faillint + }, append(trackedLabels, "user", "reason")), + //lint:ignore faillint the metrics are registered in the mimir package receivedSamplesAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_received_samples_attribution_total", + Name: "cortex_received_attributed_samples_total", Help: "The total number of samples that were received per attribution.", }, append(trackedLabels, "user")), - //nolint:faillint + //lint:ignore faillint the metrics are registered in the mimir package activeSeriesPerUserAttribution: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_ingester_active_series_attribution", + Name: "cortex_ingester_attributed_active_series", Help: "The total number of active series per user and attribution.", }, append(trackedLabels, "user")), } return m, nil } -func (t *Tracker) Collect(out chan<- prometheus.Metric) { +func (t *TrackerImp) Collect(out chan<- prometheus.Metric) { t.activeSeriesPerUserAttribution.Collect(out) t.receivedSamplesAttribution.Collect(out) t.discardedSampleAttribution.Collect(out) } // Describe implements prometheus.Collector. 
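
The getKeyValues/overflow pair above is what enforces the per-tenant cardinality limit: each distinct combination of attribution label values is hashed into the observed map, and once that map grows past maxCardinality every further combination is reported under the single __overflow__ value. Below is a simplified, self-contained sketch of that capping behaviour; the capper type is a stand-in that drops the per-stream last-update timestamps and the tenant/reason labels the patch carries alongside, so it is illustrative rather than the patch's code.

```go
// Simplified sketch of the cardinality cap in getKeyValues/overflow above.
// The capper type is a stand-in: it drops the per-stream last-update
// timestamps and the tenant/reason labels that the patch carries alongside.
package main

import "fmt"

const overflowValue = "__overflow__"

type capper struct {
	maxCardinality int
	observed       map[uint64]struct{}
}

// keyValues returns the attribution label values for one series, collapsing
// them to __overflow__ once more than maxCardinality distinct streams have
// been observed for the tenant.
func (c *capper) keyValues(stream uint64, values []string) []string {
	if _, known := c.observed[stream]; !known {
		if len(c.observed) >= c.maxCardinality {
			out := make([]string, len(values))
			for i := range out {
				out[i] = overflowValue
			}
			return out
		}
		c.observed[stream] = struct{}{}
	}
	return values
}

func main() {
	c := &capper{maxCardinality: 2, observed: map[uint64]struct{}{}}
	fmt.Println(c.keyValues(1, []string{"team-a"})) // [team-a]
	fmt.Println(c.keyValues(2, []string{"team-b"})) // [team-b]
	fmt.Println(c.keyValues(3, []string{"team-c"})) // [__overflow__]
}
```

Collapsing excess streams into one overflow series keeps the attributed metric families bounded even when a tenant sends far more label combinations than expected.
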
-func (t *Tracker) Describe(chan<- *prometheus.Desc) { - // this is an unchecked collector +func (t *TrackerImp) Describe(chan<- *prometheus.Desc) { } -func (t *Tracker) PurgeInactiveObservations(deadline int64) []*observation { +func (t *TrackerImp) PurgeInactiveObservations(deadline int64) []*Observation { obs := t.observed if obs == nil { return nil @@ -162,11 +186,11 @@ func (t *Tracker) PurgeInactiveObservations(deadline int64) []*observation { return nil } - t.oLock.Lock() - defer t.oLock.Unlock() + t.obseveredMtx.Lock() + defer t.obseveredMtx.Unlock() // Cleanup inactive observations and return all invalid observations to clean up metrics for them - res := make([]*observation, len(invalidKeys)) + res := make([]*Observation, len(invalidKeys)) for i := 0; i < len(invalidKeys); { inactiveLab := invalidKeys[i] ob := t.observed[inactiveLab] @@ -183,7 +207,7 @@ func (t *Tracker) PurgeInactiveObservations(deadline int64) []*observation { return res[:len(invalidKeys)] } -func (t *Tracker) updateMaxCardinality(limit int) { +func (t *TrackerImp) UpdateMaxCardinality(limit int) { // if we are reducing limit, we can just set it if t.maxCardinality >= limit { t.maxCardinality = limit @@ -191,10 +215,24 @@ func (t *Tracker) updateMaxCardinality(limit int) { } // if we are increasing limit, we need to check if we are already in overflow, // if yes, reset the counter, otherwise the counters won't be correct - t.oLock.Lock() - defer t.oLock.Unlock() + t.obseveredMtx.Lock() + defer t.obseveredMtx.Unlock() if len(t.observed) > t.maxCardinality { - t.observed = map[uint64]*observation{} + t.observed = map[uint64]*Observation{} } t.maxCardinality = limit } + +type NoopTracker struct{} + +func NewNoopTracker() *NoopTracker { + return &NoopTracker{} +} +func (*NoopTracker) IncrementActiveSeries(labels.Labels, time.Time) {} +func (*NoopTracker) IncrementDiscardedSamples(labels.Labels, float64, string, time.Time) {} +func (*NoopTracker) IncrementReceivedSamples(labels.Labels, float64, time.Time) {} +func (*NoopTracker) DecrementActiveSeries(labels.Labels, time.Time) {} +func (*NoopTracker) PurgeInactiveObservations(int64) []*Observation { return nil } +func (*NoopTracker) UpdateMaxCardinality(int) {} +func (*NoopTracker) GetMaxCardinality() int { return 0 } +func (*NoopTracker) GetCALabels() []string { return nil } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 38063880119..cf580b38f02 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -24,24 +24,24 @@ func Test_NewTracker(t *testing.T) { vals := []string{"foo", "user1"} cat.activeSeriesPerUserAttribution.WithLabelValues(vals...).Set(1.0) cat.receivedSamplesAttribution.WithLabelValues(vals...).Add(5) - cat.discardedSampleAttribution.WithLabelValues(vals...).Add(2) + cat.discardedSampleAttribution.WithLabelValues(append(vals, "out-of-window")...).Add(2) expectedMetrics := ` - # HELP cortex_discarded_samples_attribution_total The total number of samples that were discarded per attribution. - # TYPE cortex_discarded_samples_attribution_total counter - cortex_discarded_samples_attribution_total{platform="foo",user="user1"} 2 - # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. 
- # TYPE cortex_ingester_active_series_attribution gauge - cortex_ingester_active_series_attribution{platform="foo",user="user1"} 1 - # HELP cortex_received_samples_attribution_total The total number of samples that were received per attribution. - # TYPE cortex_received_samples_attribution_total counter - cortex_received_samples_attribution_total{platform="foo",user="user1"} 5 + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{platform="foo",reason="out-of-window", user="user1"} 2 + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="foo",user="user1"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",user="user1"} 5 ` metricNames := []string{ - "cortex_discarded_samples_attribution_total", - "cortex_received_samples_attribution_total", - "cortex_ingester_active_series_attribution", + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + "cortex_ingester_attributed_active_series", } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 0a5ad882b56..3cb39e2c17c 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1112,9 +1112,7 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { - if cat := getCATrackerForUser(userID, d.costAttributionMgr); cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) - } + getCATrackerForUser(userID, d.costAttributionMgr).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) d.discardedMetadataRateLimited.WithLabelValues(userID).Add(float64(validatedMetadata)) @@ -1679,9 +1677,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - if cat := getCATrackerForUser(userID, d.costAttributionMgr); cat != nil { - cat.IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), now) - } + getCATrackerForUser(userID, d.costAttributionMgr).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), now) } receivedMetadata = len(req.Metadata) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 3296d8ec50a..87da70a2452 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -222,22 +222,18 @@ func newExemplarValidationMetrics(r prometheus.Registerer) 
*exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() // if the validation failed, we need to increment the discarded samples metric - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -248,29 +244,23 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
-func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat costattribution.Tracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidNativeHistogramSchema, now.Time()) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidNativeHistogramSchema, now.Time()) m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -284,9 +274,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -294,9 +282,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -397,38 +383,32 @@ func removeNonASCIIChars(in string) (out string) { // getCATrackerForUser returns the cost attribution tracker for the user. 
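
The validate.go hunks in this patch drop the previous `if cat != nil` guards because getCATrackerForUser now always hands back a usable Tracker: a real TrackerImp when cost attribution is enabled for the tenant, and a NoopTracker otherwise. That is the null-object pattern; the sketch below trims the interface to a single method and uses hypothetical countingTracker/trackerFor names purely for illustration.

```go
// Sketch of the null-object pattern that the validate.go hunks rely on:
// callers always get a usable tracker, so per-call nil checks disappear.
// The interface is trimmed to one method; countingTracker and trackerFor
// are illustrative names, not part of the patch.
package main

import (
	"fmt"
	"time"
)

type tracker interface {
	IncrementDiscardedSamples(value float64, reason string, now time.Time)
}

type noopTracker struct{}

func (noopTracker) IncrementDiscardedSamples(float64, string, time.Time) {}

type countingTracker struct{ discarded map[string]float64 }

func (c *countingTracker) IncrementDiscardedSamples(v float64, reason string, _ time.Time) {
	c.discarded[reason] += v
}

// trackerFor mirrors the idea behind getCATrackerForUser: when attribution is
// disabled the caller still gets a tracker, just one that does nothing.
func trackerFor(enabled bool) tracker {
	if !enabled {
		return noopTracker{}
	}
	return &countingTracker{discarded: map[string]float64{}}
}

func main() {
	cat := trackerFor(true)
	// Validation code can call the tracker unconditionally for every
	// rejection reason, e.g. a sample too far in the future.
	cat.IncrementDiscardedSamples(1, "too_far_in_future", time.Now())
	fmt.Println(cat.(*countingTracker).discarded) // map[too_far_in_future:1]
}
```

Pushing the enabled/disabled decision into the constructor keeps every rejection path in validation down to a single unconditional call.
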
// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns nil. -func getCATrackerForUser(userID string, cam *costattribution.Manager) *costattribution.Tracker { +func getCATrackerForUser(userID string, cam *costattribution.Manager) costattribution.Tracker { if cam == nil { - return nil + return costattribution.NewNoopTracker() } return cam.TrackerForUser(userID) } // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. -func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat costattribution.Tracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMissingMetricName, ts) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMissingMetricName, ts) m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidMetricName, ts) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidMetricName, ts) m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, removeNonASCIIChars(unsafeMetricName)) } if !skipLabelCountValidation && len(ls) > cfg.MaxLabelNamesPerSeries(userID) { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerSeries, ts) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -439,29 +419,21 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabel, ts) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabel, ts) return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelNameTooLong, ts) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelNameTooLong, ts) m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, l.Value, 
mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Value) > maxLabelValueLength { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelValueTooLong, ts) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelValueTooLong, ts) m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonDuplicateLabelNames, ts) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonDuplicateLabelNames, ts) m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index 71c6e92415f..2ff553a0092 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -19,6 +19,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/validation" ) @@ -198,7 +199,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, nil, ts) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, costattribution.NewNoopTracker(), ts) assert.Equal(t, c.err, err, "wrong error") } @@ -402,7 +403,7 @@ func TestValidateLabelDuplication(t *testing.T) { actual := validateLabels(newSampleValidationMetrics(nil), cfg, userID, "", []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "a"}, {Name: model.MetricNameLabel, Value: "b"}, - }, false, false, nil, ts) + }, false, false, costattribution.NewNoopTracker(), ts) expected := fmt.Errorf( duplicateLabelMsgFormat, model.MetricNameLabel, @@ -419,7 +420,7 @@ func TestValidateLabelDuplication(t *testing.T) { {Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}, {Name: "a", Value: "a"}, - }, false, false, nil, ts) + }, false, false, costattribution.NewNoopTracker(), ts) expected = fmt.Errorf( duplicateLabelMsgFormat, "a", @@ -578,7 +579,7 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { cfg.maxNativeHistogramBuckets = limit ls := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, nil) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, costattribution.NewNoopTracker()) if limit == 1 { require.Error(t, err) @@ -625,7 +626,7 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { hist.Schema = testCase.schema - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, nil) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, costattribution.NewNoopTracker()) require.Equal(t, testCase.expectedError, err) }) } diff --git a/pkg/ingester/activeseries/active_labels_test.go 
b/pkg/ingester/activeseries/active_labels_test.go index 6fdf3e00bc4..0df17809f3b 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -12,6 +12,7 @@ import ( "github.com/prometheus/prometheus/tsdb/index" "github.com/stretchr/testify/require" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -41,7 +42,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) memPostings := index.NewMemPostings() for i, l := range series { diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 2b95020c68d..4467ab6d2ea 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -11,6 +11,7 @@ import ( "github.com/prometheus/prometheus/tsdb/index" "github.com/stretchr/testify/require" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -26,7 +27,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -62,7 +63,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -106,7 +107,7 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -146,7 +147,7 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) // Update each series at a different time according to its index. 
for i := range allStorageRefs { @@ -183,7 +184,7 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index 84c71634e72..7209a81d29d 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -11,6 +11,7 @@ import ( "github.com/prometheus/prometheus/tsdb/index" "github.com/stretchr/testify/require" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -26,7 +27,7 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -58,7 +59,7 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -90,7 +91,7 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index d827097839f..26a1f579997 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -46,11 +46,11 @@ type ActiveSeries struct { stripes [numStripes]seriesStripe deleted deletedSeries - // matchersMutex protects matchers and lastMatchersUpdate. it used by both matchers and cat - matchersMutex sync.RWMutex - matchers *asmodel.Matchers - cat *costattribution.Tracker - lastMatchersUpdate time.Time + // configMutex protects matchers and lastMatchersUpdate. it used by both matchers and cat + configMutex sync.RWMutex + matchers *asmodel.Matchers + cat costattribution.Tracker + lastConfigUpdate time.Time // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -67,7 +67,7 @@ type seriesStripe struct { // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). 
oldestEntryTs atomic.Int64 - cat *costattribution.Tracker + cat costattribution.Tracker mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -76,7 +76,6 @@ type seriesStripe struct { activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. - userID string buf labels.ScratchBuilder } @@ -85,14 +84,14 @@ type seriesEntry struct { nanos *atomic.Int64 // Unix timestamp in nanoseconds. Needs to be a pointer because we don't store pointers to entries in the stripe. matches asmodel.PreAllocDynamicSlice // Index of the matcher matching numNativeHistogramBuckets int // Number of buckets in native histogram series, -1 if not a native histogram. - // keep the value corresponding the label configured in serieStripe + deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } func NewActiveSeries( asm *asmodel.Matchers, timeout time.Duration, - cat *costattribution.Tracker, + cat costattribution.Tracker, ) *ActiveSeries { c := &ActiveSeries{ matchers: asm, timeout: timeout, cat: cat, @@ -107,31 +106,53 @@ func NewActiveSeries( } func (c *ActiveSeries) CurrentMatcherNames() []string { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() return c.matchers.MatcherNames() } +// areTrackersEqual reports whether two Tracker instances should be treated as equal. +func areTrackersEqual(t1, t2 costattribution.Tracker) bool { + if t1 == t2 { + // If both trackers are the same pointer (including nil), they are equal + return true + } + + // Use type assertion to check if both are NoopTracker + _, isNoop1 := t1.(*costattribution.NoopTracker) + _, isNoop2 := t2.(*costattribution.NoopTracker) + + // If both are NoopTracker instances, treat them as equal + return isNoop1 && isNoop2 +} + +func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg costattribution.Tracker) bool { + if ctCfg.String() != c.CurrentConfig().String() { + return true + } + return !areTrackersEqual(caCfg, c.CurrentCostAttributionTracker()) +} + func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() + c.configMutex.Lock() + defer c.configMutex.Unlock() for i := 0; i < numStripes; i++ { c.stripes[i].reinitialize(asm, &c.deleted, c.cat) } c.matchers = asm - c.lastMatchersUpdate = now + c.lastConfigUpdate = now } func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() return c.matchers.Config() } -func (c *ActiveSeries) CurrentCostAttributionTracker() *costattribution.Tracker { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() +func (c *ActiveSeries) CurrentCostAttributionTracker() costattribution.Tracker { + c.configMutex.RLock() + defer c.configMutex.RUnlock() return c.cat } @@ -167,12 +188,12 @@ func (c *ActiveSeries) PostDeletion(deleted map[chunks.HeadSeriesRef]labels.Labe // last reload. 
This should be called periodically to avoid unbounded memory // growth. func (c *ActiveSeries) Purge(now time.Time, idx tsdb.IndexReader) bool { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() + c.configMutex.Lock() + defer c.configMutex.Unlock() purgeTime := now.Add(-c.timeout) c.purge(purgeTime, idx) - return !c.lastMatchersUpdate.After(purgeTime) + return !c.lastConfigUpdate.After(purgeTime) } // purge removes expired entries from the cache. @@ -214,8 +235,8 @@ func (c *ActiveSeries) Active() (total, totalNativeHistograms, totalNativeHistog // of buckets in those active native histogram series. This method does not purge // expired entries, so Purge should be called periodically. func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, totalNativeHistograms int, totalMatchingNativeHistograms []int, totalNativeHistogramBuckets int, totalMatchingNativeHistogramBuckets []int) { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() totalMatching = make([]int, len(c.matchers.MatcherNames())) totalMatchingNativeHistograms = make([]int, len(c.matchers.MatcherNames())) @@ -415,10 +436,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef // here if we have a cost attribution label, we can split the serie count based on the value of the label // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly - if s.cat != nil { - s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) - } - + s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true } @@ -428,7 +446,6 @@ func (s *seriesStripe) clear() { defer s.mu.Unlock() s.oldestEntryTs.Store(0) - // TODO: s.refs = map[storage.SeriesRef]seriesEntry{} s.active = 0 s.activeNativeHistograms = 0 @@ -444,7 +461,7 @@ func (s *seriesStripe) clear() { func (s *seriesStripe) reinitialize( asm *asmodel.Matchers, deleted *deletedSeries, - cat *costattribution.Tracker, + cat costattribution.Tracker, ) { s.mu.Lock() defer s.mu.Unlock() @@ -480,7 +497,6 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(s.activeMatchingNativeHistogramBuckets), s.activeMatchingNativeHistogramBuckets) oldest := int64(math.MaxInt64) - buf := labels.NewScratchBuilder(128) for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { @@ -488,21 +504,15 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { s.deleted.purge(ref) } - // idx, err := db.Head().Index() - // err = idx.Series(seriesRef, &buf, nil) - // if err != nil { - // return fmt.Errorf("error getting series: %w", err) - // } - // m := &mimirpb.Metric{Labels: mimirpb.FromLabelsToLabelAdapters(buf.Labels())} - - if s.cat != nil && idx != nil { - if err := idx.Series(ref, &buf, nil); err != nil { + if idx != nil { + if err := idx.Series(ref, &s.buf, nil); err != nil { //TODO: think about what to do here + _ = err } - s.cat.DecrementActiveSeries(buf.Labels(), 1, keepUntil) + s.cat.DecrementActiveSeries(s.buf.Labels(), keepUntil) + s.buf.Reset() } delete(s.refs, ref) - // TODO: here need to find what is deleted and decrement counters continue } @@ -550,12 +560,13 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { } s.active-- - if s.cat != nil && idx != nil { + if idx != nil { if err := idx.Series(ref, &s.buf, nil); err != nil { //TODO: think 
about what to do here _ = err } - s.cat.DecrementActiveSeries(s.buf.Labels(), 1, time.Now()) + s.cat.DecrementActiveSeries(s.buf.Labels(), time.Now()) + defer s.buf.Reset() } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index 0e20ecb6f78..0c8976da536 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -20,6 +20,7 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -38,7 +39,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() @@ -203,7 +204,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { @@ -230,7 +231,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout, nil) + c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) testUpdateSeries(t, c) } @@ -447,7 +448,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout, nil) + c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) testUpdateSeries(t, c) c.Clear() @@ -488,7 +489,7 @@ func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) @@ -516,7 +517,7 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) for i := 0; i < len(series); i++ { c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) @@ -562,7 +563,7 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", 
ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute, nil) + c := NewActiveSeries(asm, 5*time.Minute, costattribution.NewNoopTracker()) exp := len(series) - ttl expMatchingSeries := 0 @@ -595,7 +596,7 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, nil) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, costattribution.NewNoopTracker()) c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1, nil) c.UpdateSeries(ls2, ref2, currentTime, -1, nil) @@ -631,7 +632,7 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, nil) + c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) valid := c.Purge(currentTime, nil) assert.True(t, valid) @@ -697,7 +698,7 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, nil) + c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -736,7 +737,7 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, nil) + c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -789,7 +790,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. - c = NewActiveSeries(&asmodel.Matchers{}, 0, nil) + c = NewActiveSeries(&asmodel.Matchers{}, 0, costattribution.NewNoopTracker()) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -927,7 +928,7 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout, nil) + c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1, nil) @@ -952,7 +953,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index b0323441234..bccd7fc6a8c 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -774,9 +774,9 @@ func (i *Ingester) replaceMatchers(asm *asmodel.Matchers, userDB *userTSDB, now // getCATrackerForUser returns the cost attribution tracker for the user. // If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns nil. 
-func getCATrackerForUser(userID string, cam *costattribution.Manager) *costattribution.Tracker { +func getCATrackerForUser(userID string, cam *costattribution.Manager) costattribution.Tracker { if cam == nil { - return nil + return costattribution.NewNoopTracker() } return cam.TrackerForUser(userID) } @@ -790,7 +790,7 @@ func (i *Ingester) updateActiveSeries(now time.Time) { newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) newCostAttributionTracker := getCATrackerForUser(userID, i.costAttributionMgr) - if newMatchersConfig.String() != userDB.activeSeries.CurrentConfig().String() || newCostAttributionTracker != userDB.activeSeries.CurrentCostAttributionTracker() { + if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionTracker) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } @@ -1302,7 +1302,6 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { // Return true if handled as soft error, and we can ingest more series. // get the cost attribution value for the series - cat := getCATrackerForUser(userID, i.costAttributionMgr) handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { @@ -1314,9 +1313,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // we actually ingested all samples which haven't failed. switch { case errors.Is(err, storage.ErrOutOfBounds): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfBounds, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfBounds, startAppend) stats.sampleOutOfBoundsCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) @@ -1324,9 +1321,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrOutOfOrderSample): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) stats.sampleOutOfOrderCount++ updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) @@ -1334,9 +1329,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrTooOldSample): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) stats.sampleTooOldCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) @@ -1344,9 +1337,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.SampleTooFarInFuture): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, 
reasonSampleTooFarInFuture, startAppend) stats.sampleTooFarInFutureCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) @@ -1354,9 +1345,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrDuplicateSampleForTimestamp): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) stats.newValueForTimestampCount++ updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) @@ -1364,9 +1353,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.MaxSeriesPerUser): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) stats.perUserSeriesLimitCount++ updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) @@ -1374,9 +1361,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.MaxSeriesPerMetric): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) stats.perMetricSeriesLimitCount++ updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) @@ -1391,45 +1376,35 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre }) return true case errors.Is(err, histogram.ErrHistogramCountMismatch): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramCountNotBigEnough): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramNegativeBucketCount): - if cat != nil { - 
cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramSpanNegativeOffset): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramSpansBucketsMismatch): - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) @@ -1451,8 +1426,12 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels - for _, ts := range timeseries { + // idx is used to decrease active series count in case of error for cost attribution. + idx, _ := i.getTSDB(userID).Head().Index() + // TODO: deal with the error here + + for _, ts := range timeseries { // The labels must be sorted (in our case, it's guaranteed a write request // has sorted labels once hit the ingester). 
@@ -1468,9 +1447,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleOutOfBoundsCount += len(ts.Samples) + len(ts.Histograms) - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleOutOfBounds, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleOutOfBounds, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1491,9 +1468,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) stats.sampleOutOfBoundsCount += len(ts.Samples) - if cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleOutOfBounds, startAppend) - } + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleOutOfBounds, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -1614,8 +1589,6 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } if activeSeries != nil && stats.succeededSamplesCount > oldSucceededSamplesCount { - idx, _ := i.getTSDB(userID).Head().Index() - // TODO: deal with the error here activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets, idx) } @@ -2709,13 +2682,12 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD ownedSeriedStateShardSize = i.ownedSeriesService.ringStrategy.shardSizeForUser(userID) } - cat := getCATrackerForUser(userID, i.costAttributionMgr) userDB := &userTSDB{ userID: userID, activeSeries: activeseries.NewActiveSeries( asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, - cat, + getCATrackerForUser(userID, i.costAttributionMgr), ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index e9766753525..b4a5dc74baf 100644 --- a/pkg/ingester/user_tsdb.go +++ b/pkg/ingester/user_tsdb.go @@ -619,13 +619,13 @@ func (u *userTSDB) computeOwnedSeries() int { } count := 0 + idx, _ := u.Head().Index() + // TODO: deal with the err here u.Head().ForEachSecondaryHash(func(refs []chunks.HeadSeriesRef, secondaryHashes []uint32) { for i, sh := range secondaryHashes { if u.ownedTokenRanges.IncludesKey(sh) { count++ } else { - idx, _ := u.Head().Index() - // TODO: deal with the err here u.activeSeries.Delete(refs[i], idx) } } diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index f3ed3543fa5..b4986401771 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -151,7 +151,6 @@ type Config struct { TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` - CostAttributionCoolDownDuration time.Duration `yaml:"cost_attribution_cool_down_duration" category:"experimental"` } // RegisterFlags registers flags. 
@@ -174,8 +173,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.StringVar(&c.NoAuthTenant, "auth.no-auth-tenant", "anonymous", "Tenant ID to use when multitenancy is disabled.") f.BoolVar(&c.PrintConfig, "print.config", false, "Print the config and exit.") f.DurationVar(&c.ShutdownDelay, "shutdown-delay", 0, "How long to wait between SIGTERM and shutdown. After receiving SIGTERM, Mimir will report not-ready status via /ready endpoint.") - f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution-eviction-interval", 30*time.Minute, "Time interval at which inactive cost attributions will be evicted from the cache.") - f.DurationVar(&c.CostAttributionCoolDownDuration, "cost-attribution-cool-down-duration", 20*time.Minute, "Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 30*time.Minute, "Time interval at which inactive cost attributions will be evicted from the counter, so it won't be counted when checking max_cost_attribution_cardinality_per_user.") f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 36b60160ad6..af2295048db 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -44,6 +44,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -651,6 +652,7 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { func (t *Mimir) initCostAttributionService() (services.Service, error) { // The cost attribution service is only initilized if the custom registry path is provided. if t.Cfg.CustomRegistryPath != "" { + t.CostAttributionManager = costattribution.NewManager(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides) // if custom registry path is provided, create a custom registry and use it for cost attribution service customRegistry := prometheus.NewRegistry() // Register the custom registry with the provided URL. diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index de374f2b786..1ca2654ad49 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -184,8 +184,8 @@ type Limits struct { ActiveSeriesResultsMaxSizeBytes int `yaml:"active_series_results_max_size_bytes" json:"active_series_results_max_size_bytes" category:"experimental"` // Cost attribution and limit. 
- CostAttributionLabels flagext.StringSliceCSV `yaml:"cost_attribution_labels" json:"cost_attribution_labels" category:"experimental"` - MaxCostAttributionPerUser int `yaml:"max_cost_attribution_per_user" json:"max_cost_attribution_per_user" category:"experimental"` + CostAttributionLabels flagext.StringSliceCSV `yaml:"cost_attribution_labels" json:"cost_attribution_labels" category:"experimental"` + MaxCostAttributionCardinalityPerUser int `yaml:"max_cost_attribution_cardinality_per_user" json:"max_cost_attribution_cardinality_per_user" category:"experimental"` // Ruler defaults and limits. RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` @@ -294,7 +294,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") f.Var(&l.CostAttributionLabels, "validation.cost-attribution-labels", "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.") - f.IntVar(&l.MaxCostAttributionPerUser, "validation.max-cost-attribution-per-user", 0, "Maximum number of cost attribution labels allowed per user.") + f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 0, "Maximum cardinality of cost attribution labels allowed per user.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 
0 to disable") @@ -432,6 +432,7 @@ func (l *Limits) unmarshal(decode func(any) error) error { return err } l.extensions = getExtensions() + return l.validate() } @@ -783,12 +784,12 @@ func (o *Overrides) SeparateMetricsGroupLabel(userID string) string { return o.getOverridesForUser(userID).SeparateMetricsGroupLabel } -func (o *Overrides) CostAttributionLabel(userID string) []string { +func (o *Overrides) CostAttributionLabels(userID string) []string { return o.getOverridesForUser(userID).CostAttributionLabels } -func (o *Overrides) MaxCostAttributionPerUser(userID string) int { - return o.getOverridesForUser(userID).MaxCostAttributionPerUser +func (o *Overrides) MaxCostAttributionCardinalityPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionCardinalityPerUser } // IngestionTenantShardSize returns the ingesters shard size for a given user. From cb99a3f409040b1df85437ad9f9bf72e676b35c7 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 25 Oct 2024 17:13:58 +0200 Subject: [PATCH 04/32] fix lint and ci --- .../configuration-parameters/index.md | 26 +++++++++++++++++++ pkg/distributor/distributor.go | 4 ++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 73fb0a7093a..55a4f6fc9a4 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -458,9 +458,21 @@ overrides_exporter: # time. [common: ] +# (advanced) Defines a custom path for the registry. When specified, Mimir will +# expose cost attribution metrics through this custom path, if not specified, +# cost attribution metrics won't be exposed. +# CLI flag: -custom-registry-path +[custom_registry_path: | default = ""] + # (experimental) Enables optimized marshaling of timeseries. # CLI flag: -timeseries-unmarshal-caching-optimization-enabled [timeseries_unmarshal_caching_optimization_enabled: | default = true] + +# (experimental) Time interval at which inactive cost attributions will be +# evicted from the counter, so it won't be counted when checking +# max_cost_attribution_cardinality_per_user. +# CLI flag: -cost-attribution.eviction-interval +[cost_attribution_eviction_interval: | default = 30m] ``` ### common @@ -3527,6 +3539,20 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -querier.active-series-results-max-size-bytes [active_series_results_max_size_bytes: | default = 419430400] +# (experimental) List of labels used to define the cost attribution. This label +# will be included in the specified distributor and ingester metrics for each +# write request, allowing them to be distinguished by the label. The label +# applies to the following metrics: cortex_distributor_received_samples_total, +# cortex_ingester_active_series and cortex_discarded_samples_attribution_total. +# Set to an empty string to disable cost attribution. +# CLI flag: -validation.cost-attribution-labels +[cost_attribution_labels: | default = ""] + +# (experimental) Maximum cardinality of cost attribution labels allowed per +# user. +# CLI flag: -validation.max-cost-attribution-cardinality-per-user +[max_cost_attribution_cardinality_per_user: | default = 0] + # Duration to delay the evaluation of rules to ensure the underlying metrics # have been pushed. 
# CLI flag: -ruler.evaluation-delay-duration diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 3cb39e2c17c..3a1b27633c7 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1112,7 +1112,9 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { - getCATrackerForUser(userID, d.costAttributionMgr).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + if len(req.Timeseries) > 0 { + getCATrackerForUser(userID, d.costAttributionMgr).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) d.discardedMetadataRateLimited.WithLabelValues(userID).Add(float64(validatedMetadata)) From b0d3f0a3c7c3eb76ecf5d4c442c49997af62c428 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 25 Oct 2024 17:22:39 +0200 Subject: [PATCH 05/32] change max-cost-attribution-cardinality-per-user to 10k --- docs/sources/mimir/configure/configuration-parameters/index.md | 2 +- pkg/util/validation/limits.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 55a4f6fc9a4..f23efe8538d 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3551,7 +3551,7 @@ The `limits` block configures default and per-tenant limits imposed by component # (experimental) Maximum cardinality of cost attribution labels allowed per # user. # CLI flag: -validation.max-cost-attribution-cardinality-per-user -[max_cost_attribution_cardinality_per_user: | default = 0] +[max_cost_attribution_cardinality_per_user: | default = 10000] # Duration to delay the evaluation of rules to ensure the underlying metrics # have been pushed. diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 1ca2654ad49..1c0885c0877 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -294,7 +294,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") f.Var(&l.CostAttributionLabels, "validation.cost-attribution-labels", "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. 
Set to an empty string to disable cost attribution.") - f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 0, "Maximum cardinality of cost attribution labels allowed per user.") + f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable") From ebd61053a78881b5639a6c5356d3840de3f8d419 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 25 Oct 2024 19:34:41 +0200 Subject: [PATCH 06/32] change custom registry path Signed-off-by: Ying WANG --- cmd/mimir/config-descriptor.json | 6 +++--- cmd/mimir/help-all.txt.tmpl | 4 ++-- .../configure/configuration-parameters/index.md | 4 ++-- pkg/distributor/distributor.go | 15 ++++++--------- pkg/distributor/validate.go | 3 +-- pkg/ingester/activeseries/active_series.go | 4 ---- pkg/ingester/ingester.go | 11 ++++++----- pkg/mimir/mimir.go | 9 +++++---- pkg/mimir/modules.go | 4 ++-- pkg/util/validation/limits.go | 1 + 10 files changed, 28 insertions(+), 33 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index edc308348fd..e691bba057a 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4364,7 +4364,7 @@ "required": false, "desc": "Maximum cardinality of cost attribution labels allowed per user.", "fieldValue": null, - "fieldDefaultValue": 0, + "fieldDefaultValue": 10000, "fieldFlag": "validation.max-cost-attribution-cardinality-per-user", "fieldType": "int", "fieldCategory": "experimental" @@ -18370,12 +18370,12 @@ }, { "kind": "field", - "name": "custom_registry_path", + "name": "cost_attribution_registry_path", "required": false, "desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.", "fieldValue": null, "fieldDefaultValue": "", - "fieldFlag": "custom-registry-path", + "fieldFlag": "cost-attribution.registry-path", "fieldType": "string", "fieldCategory": "advanced" }, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index a5bac0676c5..fd1f6e1295c 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1141,7 +1141,7 @@ Usage of ./cmd/mimir/mimir: Configuration file to load. -cost-attribution.eviction-interval duration [experimental] Time interval at which inactive cost attributions will be evicted from the counter, so it won't be counted when checking max_cost_attribution_cardinality_per_user. 
(default 30m0s) - -custom-registry-path string + -cost-attribution.registry-path string Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. @@ -3108,7 +3108,7 @@ Usage of ./cmd/mimir/mimir: -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) -validation.max-cost-attribution-cardinality-per-user int - [experimental] Maximum cardinality of cost attribution labels allowed per user. + [experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000) -validation.max-label-names-per-series int Maximum number of label names per series. (default 30) -validation.max-length-label-name int diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index f23efe8538d..21301bcd521 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -461,8 +461,8 @@ overrides_exporter: # (advanced) Defines a custom path for the registry. When specified, Mimir will # expose cost attribution metrics through this custom path, if not specified, # cost attribution metrics won't be exposed. -# CLI flag: -custom-registry-path -[custom_registry_path: | default = ""] +# CLI flag: -cost-attribution.registry-path +[cost_attribution_registry_path: | default = ""] # (experimental) Enables optimized marshaling of timeseries. # CLI flag: -timeseries-unmarshal-caching-optimization-enabled diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 3a1b27633c7..ff6ec129e40 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -106,7 +106,8 @@ type Distributor struct { distributorsLifecycler *ring.BasicLifecycler distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 - costAttributionMgr *costattribution.Manager + + costAttributionMgr *costattribution.Manager // For handling HA replicas. HATracker *haTracker @@ -711,12 +712,13 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. 
func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) error { - now := model.TimeFromUnixNano(nowt.UnixNano()) cat := getCATrackerForUser(userID, d.costAttributionMgr) if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return err } + now := model.TimeFromUnixNano(nowt.UnixNano()) + for _, s := range ts.Samples { if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err @@ -852,11 +854,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) - if d.costAttributionMgr != nil { - if cat := d.costAttributionMgr.TrackerForUser(userID); cat != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) - } - } + getCATrackerForUser(userID, d.costAttributionMgr).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1673,13 +1671,12 @@ func tokenForMetadata(userID string, metricName string) uint32 { } func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { - now := mtime.Now() var receivedSamples, receivedExemplars, receivedMetadata int for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - getCATrackerForUser(userID, d.costAttributionMgr).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), now) + getCATrackerForUser(userID, d.costAttributionMgr).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), mtime.Now()) } receivedMetadata = len(req.Metadata) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 87da70a2452..c7daf466f32 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -225,7 +225,6 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() - // if the validation failed, we need to increment the discarded samples metric cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) @@ -382,7 +381,7 @@ func removeNonASCIIChars(in string) (out string) { } // getCATrackerForUser returns the cost attribution tracker for the user. -// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns nil. +// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns a noop tracker. 
func getCATrackerForUser(userID string, cam *costattribution.Manager) costattribution.Tracker { if cam == nil { return costattribution.NewNoopTracker() diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 26a1f579997..1194a21f722 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -167,7 +167,6 @@ func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, c.stripes[deletedStripeID].remove(deleted.ref, idx) } } - } // PostDeletion should be called when series are deleted from the head. @@ -201,7 +200,6 @@ func (c *ActiveSeries) purge(keepUntil time.Time, idx tsdb.IndexReader) { for s := 0; s < numStripes; s++ { c.stripes[s].purge(keepUntil, idx) } - } func (c *ActiveSeries) ContainsRef(ref storage.SeriesRef) bool { @@ -414,7 +412,6 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef matchesLen := matches.Len() s.active++ - if numNativeHistogramBuckets >= 0 { s.activeNativeHistograms++ s.activeNativeHistogramBuckets += uint32(numNativeHistogramBuckets) @@ -521,7 +518,6 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { s.activeNativeHistograms++ s.activeNativeHistogramBuckets += uint32(entry.numNativeHistogramBuckets) } - ml := entry.matches.Len() for i := 0; i < ml; i++ { match := entry.matches.Get(i) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index bccd7fc6a8c..3f701c1eec7 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -378,7 +378,8 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus forceCompactTrigger: make(chan requestWithUsersAndCallback), shipTrigger: make(chan requestWithUsersAndCallback), seriesHashCache: hashcache.NewSeriesHashCache(cfg.BlocksStorageConfig.TSDB.SeriesHashCacheMaxBytes), - errorSamplers: newIngesterErrSamplers(cfg.ErrorSampleRate), + + errorSamplers: newIngesterErrSamplers(cfg.ErrorSampleRate), }, nil } @@ -391,6 +392,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.ingestionRate = util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval) i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService + i.costAttributionMgr = costAttributionMgr // We create a circuit breaker, which will be activated on a successful completion of starting. i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -773,7 +775,7 @@ func (i *Ingester) replaceMatchers(asm *asmodel.Matchers, userDB *userTSDB, now } // getCATrackerForUser returns the cost attribution tracker for the user. -// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns nil. +// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns a noop tracker. 
func getCATrackerForUser(userID string, cam *costattribution.Manager) costattribution.Tracker { if cam == nil { return costattribution.NewNoopTracker() @@ -1174,8 +1176,7 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // Keep track of some stats which are tracked only if the samples will be // successfully committed - - stats = pushStats{} + stats pushStats firstPartialErr error // updateFirstPartial is a function that, in case of a softError, stores that error @@ -1300,6 +1301,7 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.PreallocTimeseries, app extendedAppender, startAppend time.Time, stats *pushStats, updateFirstPartial func(sampler *util_log.Sampler, errFn softErrorFunction), activeSeries *activeseries.ActiveSeries, outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { + // Return true if handled as soft error, and we can ingest more series. // get the cost attribution value for the series cat := getCATrackerForUser(userID, i.costAttributionMgr) @@ -2703,7 +2705,6 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD localSeriesLimit: initialLocalLimit, }, } - userDB.triggerRecomputeOwnedSeries(recomputeOwnedSeriesReasonNewUser) oooTW := i.limits.OutOfOrderTimeWindow(userID) diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index b4986401771..6b78c476eb5 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -146,8 +146,8 @@ type Config struct { ContinuousTest continuoustest.Config `yaml:"-"` OverridesExporter exporter.Config `yaml:"overrides_exporter"` - Common CommonConfig `yaml:"common"` - CustomRegistryPath string `yaml:"custom_registry_path" category:"advanced"` + Common CommonConfig `yaml:"common"` + CostAttributionRegistryPath string `yaml:"cost_attribution_registry_path" category:"advanced"` TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` @@ -177,7 +177,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") - f.StringVar(&c.CustomRegistryPath, "custom-registry-path", "", "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.") + f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. 
When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.") c.API.RegisterFlags(f) c.registerServerFlagsWithChangedDefaultValues(f) c.Distributor.RegisterFlags(f, logger) @@ -717,7 +717,6 @@ type Mimir struct { TenantLimits validation.TenantLimits Overrides *validation.Overrides ActiveGroupsCleanup *util.ActiveGroupsCleanupService - CostAttributionManager *costattribution.Manager Distributor *distributor.Distributor Ingester *ingester.Ingester @@ -745,6 +744,8 @@ type Mimir struct { BlockBuilderScheduler *blockbuilderscheduler.BlockBuilderScheduler ContinuousTestManager *continuoustest.Manager BuildInfoHandler http.Handler + + CostAttributionManager *costattribution.Manager } // New makes a new Mimir. diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index af2295048db..721c607fe0f 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -651,14 +651,14 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { func (t *Mimir) initCostAttributionService() (services.Service, error) { // The cost attribution service is only initilized if the custom registry path is provided. - if t.Cfg.CustomRegistryPath != "" { + if t.Cfg.CostAttributionRegistryPath != "" { t.CostAttributionManager = costattribution.NewManager(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides) // if custom registry path is provided, create a custom registry and use it for cost attribution service customRegistry := prometheus.NewRegistry() // Register the custom registry with the provided URL. // This allows users to expose custom metrics on a separate endpoint. // This is useful when users want to expose metrics that are not part of the default Mimir metrics. - http.Handle(t.Cfg.CustomRegistryPath, promhttp.HandlerFor(customRegistry, promhttp.HandlerOpts{Registry: customRegistry})) + http.Handle(t.Cfg.CostAttributionRegistryPath, promhttp.HandlerFor(customRegistry, promhttp.HandlerOpts{Registry: customRegistry})) err := customRegistry.Register(t.CostAttributionManager) return t.CostAttributionManager, err } diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 1c0885c0877..5d85174ce4f 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -293,6 +293,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&l.OutOfOrderBlocksExternalLabelEnabled, "ingester.out-of-order-blocks-external-label-enabled", false, "Whether the shipper should label out-of-order blocks with an external label before uploading them. Setting this label will compact out-of-order blocks separately from non-out-of-order blocks") f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") + f.Var(&l.CostAttributionLabels, "validation.cost-attribution-labels", "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. 
The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.") f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") From a4383315e43398a36d4e1f0c8dbdf03f8503f0f6 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 25 Oct 2024 19:49:37 +0200 Subject: [PATCH 07/32] Add license for lint Signed-off-by: Ying WANG --- pkg/costattribution/manager.go | 2 ++ pkg/costattribution/manager_test.go | 2 ++ pkg/costattribution/tracker_test.go | 2 ++ 3 files changed, 6 insertions(+) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index c1e3f1d7a94..cb7b3201e71 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: AGPL-3.0-only + package costattribution import ( diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 8bdf56b5bc5..bce902a94dc 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: AGPL-3.0-only + package costattribution // func newTestManager() *Manager { diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index cf580b38f02..7f99e5d1ef8 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: AGPL-3.0-only + package costattribution import ( From 13a0b2c26fceac401119f3905b0ac474a86cd94d Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Sun, 27 Oct 2024 22:08:32 +0100 Subject: [PATCH 08/32] add reset logics to handle overflow and recovery from overflow --- pkg/costattribution/manager.go | 13 +- pkg/costattribution/manager_test.go | 495 +++++++++++++--------------- pkg/costattribution/tracker.go | 67 +++- pkg/costattribution/tracker_test.go | 69 +++- 4 files changed, 344 insertions(+), 300 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index cb7b3201e71..08e33f0d30f 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -47,7 +47,8 @@ func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logge } func (m *Manager) iteration(_ context.Context) error { - m.purgeInactiveAttributions(m.inactiveTimeout) + currentTime := time.Now() + m.purgeInactiveAttributionsUntil(currentTime.Add(-m.inactiveTimeout).Unix()) return nil } @@ -66,7 +67,7 @@ func (m *Manager) TrackerForUser(userID string) Tracker { // if not exists, create a new tracker if _, exists := m.trackersByUserID[userID]; !exists { - m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) + m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) } return m.trackersByUserID[userID] } @@ -96,8 +97,7 @@ func (m *Manager) deleteUserTracer(userID string) { delete(m.trackersByUserID, userID) } -func (m *Manager) purgeInactiveAttributions(inactiveTimeout 
time.Duration) { - +func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) { // Get all userIDs from the map m.mtx.RLock() userIDs := make([]string, 0, len(m.trackersByUserID)) @@ -107,7 +107,6 @@ func (m *Manager) purgeInactiveAttributions(inactiveTimeout time.Duration) { m.mtx.RUnlock() // Iterate over all userIDs and purge inactive attributions of each user - currentTime := time.Now() for _, userID := range userIDs { // if cost attribution is not enabled for the user, delete the user tracker and continue if len(m.limits.CostAttributionLabels(userID)) == 0 || m.limits.MaxCostAttributionCardinalityPerUser(userID) <= 0 { @@ -115,7 +114,7 @@ func (m *Manager) purgeInactiveAttributions(inactiveTimeout time.Duration) { continue } // get all inactive attributions for the user and clean up the tracker - inactiveObs := m.purgeInactiveObservationsForUser(userID, currentTime.Add(-inactiveTimeout).UnixNano()) + inactiveObs := m.purgeInactiveObservationsForUser(userID, deadline) for _, ob := range inactiveObs { m.trackersByUserID[userID].cleanupTrackerAttribution(ob.lvalues) } @@ -149,7 +148,7 @@ func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64 // if they are different, we need to update the tracker, we don't mind, just reinitialized the tracker if !compareStringSlice(cat.GetCALabels(), newTrackedLabels) { m.mtx.Lock() - m.trackersByUserID[userID], _ = newTracker(m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) + m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) // update the tracker with the new tracker cat = m.trackersByUserID[userID] m.mtx.Unlock() diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index bce902a94dc..b25e0c8d926 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -2,275 +2,226 @@ package costattribution -// func newTestManager() *Manager { -// logger := log.NewNopLogger() -// limits, _ := validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(map[string]*validation.Limits{ -// "user1": { -// MaxCostAttributionPerUser: 5, -// CostAttributionLabel: "team", -// }, -// "user2": { -// MaxCostAttributionPerUser: 2, -// CostAttributionLabel: "", -// }, -// "user3": { -// MaxCostAttributionPerUser: 2, -// CostAttributionLabel: "department", -// }, -// })) -// inactiveTimeout := 2 * time.Minute -// cooldownTimeout := 1 * time.Minute -// cleanupInterval := 1 * time.Minute -// return NewManager(cleanupInterval, inactiveTimeout, cooldownTimeout, logger, limits) -// } - -// func Test_NewManager(t *testing.T) { -// manager := newTestManager() -// assert.NotNil(t, manager, "Expected manager to be initialized") -// assert.NotNil(t, manager.attributionTracker, "Expected attribution tracker to be initialized") -// assert.Equal(t, "__overflow__", manager.invalidValue, "Expected invalidValue to be initialized") -// } - -// func Test_EnabledForUser(t *testing.T) { -// manager := newTestManager() -// assert.True(t, manager.EnabledForUser("user1"), "Expected cost attribution to be enabled for user1") -// assert.False(t, manager.EnabledForUser("user2"), "Expected cost attribution to be disabled for user2") -// assert.False(t, manager.EnabledForUser("user4"), "Expected cost attribution to be disabled for user4") -// } - -// func Test_UserAttributionLabel(t *testing.T) { -// manager := newTestManager() -// assert.Equal(t, 
"team", manager.UserAttributionLabel("user1")) -// assert.Equal(t, "", manager.UserAttributionLabel("user2")) -// assert.Equal(t, "department", manager.UserAttributionLabel("user3")) -// assert.Equal(t, 2, len(manager.attributionTracker.trackersByUserID)) -// assert.Equal(t, "team", manager.attributionTracker.trackersByUserID["user1"].trackedLabel) -// assert.Equal(t, "department", manager.attributionTracker.trackersByUserID["user3"].trackedLabel) -// } - -// func Test_UserAttributionLimit(t *testing.T) { -// manager := newTestManager() -// assert.Equal(t, 5, manager.UserAttributionLimit("user1")) -// assert.Equal(t, 0, manager.UserAttributionLimit("user2")) -// assert.Equal(t, 0, manager.UserAttributionLimit("user4")) -// } - -// func Test_UpdateAttributionTimestamp(t *testing.T) { -// manager := newTestManager() - -// lbls := labels.NewBuilder(labels.EmptyLabels()) -// tm1, tm2, tm3 := "bar", "foo", "baz" -// t.Run("Should update the timestamp when limit not reached for the user attribution", func(t *testing.T) { -// lbls.Set("department", tm1) -// isOutdated, result := manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(0, 0)) -// assert.False(t, isOutdated, "Expected label to be the same as the one in the cache") -// assert.Equal(t, tm1, result, "Expected attribution to be returned since user is enabled for cost attribution, and limit is not reached") -// assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].observed[tm1]) -// assert.Equal(t, int64(0), manager.attributionTracker.trackersByUserID["user3"].observed[tm1].Load()) - -// lbls.Set("department", tm2) -// isOutdated, result = manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(1, 0)) -// assert.False(t, isOutdated) -// assert.Equal(t, tm2, result, "Expected attribution to be returned since user is enabled for cost attribution, and limit is not reached") -// assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].observed[tm2]) -// assert.Equal(t, int64(1), manager.attributionTracker.trackersByUserID["user3"].observed[tm2].Load()) -// }) - -// t.Run("Should only update the timestamp of invalide when limit reached for the user attribution", func(t *testing.T) { -// lbls.Set("department", tm3) -// isOutdated, result := manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(2, 0)) -// assert.False(t, isOutdated) -// assert.Equal(t, manager.invalidValue, result, "Expected invalidValue to be returned since user has reached the limit of cost attribution labels") -// assert.NotNil(t, manager.attributionTracker.trackersByUserID["user3"].observed[manager.invalidValue]) -// assert.Equal(t, int64(2), manager.attributionTracker.trackersByUserID["user3"].observed[manager.invalidValue].Load()) - -// lbls.Set("department", tm1) -// isOutdated, result = manager.UpdateAttributionTimestamp("user3", "department", lbls.Labels(), time.Unix(3, 0)) -// assert.False(t, isOutdated) -// assert.Equal(t, manager.invalidValue, result, "Expected invalidValue to be returned since user has reached the limit of cost attribution labels") -// assert.Equal(t, int64(3), manager.attributionTracker.trackersByUserID["user3"].observed[manager.invalidValue].Load()) -// }) -// } - -// func Test_SetActiveSeries(t *testing.T) { -// manager := newTestManager() -// reg := prometheus.NewRegistry() -// err := reg.Register(manager) -// require.NoError(t, err) -// userID := "user1" - -// lbls := labels.NewBuilder(labels.EmptyLabels()) - -// t.Run("Should set 
the active series gauge for the given user and attribution", func(t *testing.T) { -// lbls.Set("team", "foo") -// isOutdated, val := manager.UpdateAttributionTimestamp(userID, "team", lbls.Labels(), time.Unix(0, 0)) -// assert.False(t, isOutdated) -// manager.SetActiveSeries(userID, "team", val, 1.0) -// expectedMetrics := ` -// # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. -// # TYPE cortex_ingester_active_series_attribution gauge -// cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 -// ` -// metricNames := []string{ -// "cortex_ingester_active_series_attribution", -// } -// assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) -// }) - -// t.Run("Should set the active series gauge for all users and attributions enabled and ignore disabled user", func(t *testing.T) { -// userID = "user3" -// lbls.Set("department", "bar") -// isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(0, 0)) -// assert.False(t, isOutdated) -// manager.SetActiveSeries(userID, "department", val, 2.0) - -// lbls.Set("department", "baz") -// isOutdated, val = manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(1, 0)) -// assert.False(t, isOutdated) -// manager.SetActiveSeries(userID, "department", val, 3.0) - -// expectedMetrics := ` -// # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. -// # TYPE cortex_ingester_active_series_attribution gauge -// cortex_ingester_active_series_attribution{department="bar",user="user3"} 2 -// cortex_ingester_active_series_attribution{department="baz",user="user3"} 3 -// cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 -// ` -// metricNames := []string{ -// "cortex_ingester_active_series_attribution", -// } -// assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) -// }) - -// t.Run("Cleanup the active series gauge for the given user and attribution when cost attribution disabled", func(t *testing.T) { -// limits := manager.attributionTracker.limits -// defer func() { manager.attributionTracker.limits = limits }() -// userID = "user3" -// lbls.Set("department", "baz") - -// overrides, _ := validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(map[string]*validation.Limits{ -// userID: { -// MaxCostAttributionPerUser: 2, -// CostAttributionLabel: "", -// }, -// })) -// manager.attributionTracker.limits = overrides -// isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(5, 0)) -// assert.False(t, isOutdated) -// manager.SetActiveSeries(userID, val, "department", 3.0) - -// expectedMetrics := ` -// # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. 
-// # TYPE cortex_ingester_active_series_attribution gauge -// cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 -// ` -// metricNames := []string{ -// "cortex_ingester_active_series_attribution", -// } -// assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) -// }) - -// t.Run("Should ignore setting the active series gauge for disabled user", func(t *testing.T) { -// userID = "user2" -// lbls.Set("department", "bar") -// isOutdated, val := manager.UpdateAttributionTimestamp(userID, "department", lbls.Labels(), time.Unix(0, 0)) -// assert.False(t, isOutdated) -// manager.SetActiveSeries(userID, val, "department", 4.0) - -// expectedMetrics := ` -// # HELP cortex_ingester_active_series_attribution The total number of active series per user and attribution. -// # TYPE cortex_ingester_active_series_attribution gauge -// cortex_ingester_active_series_attribution{team="foo",user="user1"} 1 -// ` -// metricNames := []string{ -// "cortex_ingester_active_series_attribution", -// } -// assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) -// }) -// } - -// func TestUpdateAttributionTimestampForUser(t *testing.T) { -// cooldownTimeout := 10 * time.Second -// t.Run("Should not update the timestamp for the user if attribution lable is not set", func(t *testing.T) { -// // Create mock limits -// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "", MaxCostAttributionPerUser: 5}, nil) -// assert.NoError(t, err) -// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) -// assert.NotNil(t, trackerGroup) - -// ts := time.Unix(1, 0) -// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "platformA", ts) -// trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "teamB", ts) - -// assert.Equal(t, 0, len(trackerGroup.trackersByUserID)) -// }) - -// t.Run("Should not update the timestamp for the user if max cost attribution per user is 0", func(t *testing.T) { -// // Create mock limits -// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 0}, nil) -// assert.NoError(t, err) - -// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) -// assert.NotNil(t, trackerGroup) - -// ts := time.Unix(1, 0) -// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "platformA", ts) -// trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "teamB", ts) - -// assert.Equal(t, 0, len(trackerGroup.trackersByUserID)) -// }) - -// t.Run("Should update the timestamp for the user attribution", func(t *testing.T) { -// // Create mock limits -// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil) -// assert.NoError(t, err) - -// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) -// assert.NotNil(t, trackerGroup) - -// ts := time.Unix(1, 0) -// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", ts) -// trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "barA", ts) - -// assert.Equal(t, 2, len(trackerGroup.trackersByUserID)) -// fmt.Println(trackerGroup.trackersByUserID) -// assert.NotNil(t, trackerGroup.trackersByUserID["tenantA"]) -// assert.NotNil(t, trackerGroup.trackersByUserID["tenantA"].observed["fooA"]) -// assert.Equal(t, int64(1), trackerGroup.trackersByUserID["tenantA"].observed["fooA"].Load()) - -// 
trackerGroup.updateAttributionCacheForUser("tenantB", "platform", "barA", ts.Add(time.Second)) -// assert.Equal(t, int64(2), trackerGroup.trackersByUserID["tenantB"].observed["barA"].Load()) -// }) -// } - -// func TestUserAttributionLabel(t *testing.T) { -// cooldownTimeout := 10 * time.Second -// t.Run("Should return the cost attribution label for the user", func(t *testing.T) { -// // Create mock limits -// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil) -// assert.NoError(t, err) - -// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) -// assert.NotNil(t, trackerGroup) -// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", time.Unix(0, 0)) - -// assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) -// }) - -// t.Run("Should return the default cost attribution label for the user if it is in cache", func(t *testing.T) { -// // Create mock limits -// limiter, err := validation.NewOverrides(validation.Limits{CostAttributionLabel: "platform", MaxCostAttributionPerUser: 5}, nil) -// assert.NoError(t, err) - -// trackerGroup := newAttributionTrackerGroup(limiter, cooldownTimeout) -// assert.NotNil(t, trackerGroup) - -// assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) - -// // update the timestamp for the user, so cache is updated -// trackerGroup.updateAttributionCacheForUser("tenantA", "platform", "fooA", time.Unix(0, 0)) - -// // still read the cost attribution label from cache until cache is updated by timed service -// assert.Equal(t, "platform", trackerGroup.getUserAttributionLabelFromCache("tenantA")) -// }) -// } +import ( + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/grafana/mimir/pkg/util/validation" +) + +func getMockLimits(idx int) (*validation.Overrides, error) { + // Define base limits + baseLimits := map[string]*validation.Limits{ + "user1": { + MaxCostAttributionCardinalityPerUser: 5, + CostAttributionLabels: []string{"team"}, + }, + "user2": { + MaxCostAttributionCardinalityPerUser: 2, + CostAttributionLabels: []string{}, + }, + "user3": { + MaxCostAttributionCardinalityPerUser: 2, + CostAttributionLabels: []string{"department", "service"}, + }, + } + + // Adjust specific cases as needed + switch idx { + case 1: + baseLimits["user1"].CostAttributionLabels = []string{} + case 2: + baseLimits["user3"].CostAttributionLabels = []string{"team", "feature"} + case 3: + baseLimits["user3"].MaxCostAttributionCardinalityPerUser = 3 + case 4: + baseLimits["user1"].MaxCostAttributionCardinalityPerUser = 2 + case 5: + baseLimits["user1"].CostAttributionLabels = []string{"department"} + } + + return validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(baseLimits)) +} + +func newTestManager() *Manager { + logger := log.NewNopLogger() + limits, _ := getMockLimits(0) + inactiveTimeout := 10 * time.Second + cleanupInterval := 5 * time.Second + return NewManager(cleanupInterval, inactiveTimeout, logger, limits) +} + +func Test_NewManager(t *testing.T) { + manager := newTestManager() + assert.NotNil(t, manager, "Expected manager to be initialized") + assert.NotNil(t, manager.trackersByUserID, "Expected attribution 
tracker to be initialized") + assert.Equal(t, 10*time.Second, manager.inactiveTimeout, "Expected inactiveTimeout to be initialized") +} + +func Test_EnabledForUser(t *testing.T) { + manager := newTestManager() + assert.True(t, manager.EnabledForUser("user1"), "Expected cost attribution to be enabled for user1") + assert.False(t, manager.EnabledForUser("user2"), "Expected cost attribution to be disabled for user2") + assert.False(t, manager.EnabledForUser("user4"), "Expected cost attribution to be disabled for user4") +} + +func Test_CreateDeleteTracker(t *testing.T) { + // Create a new manager and register it with prometheus registry + manager := newTestManager() + reg := prometheus.NewRegistry() + err := reg.Register(manager) + require.NoError(t, err) + + t.Run("Get tracker for user", func(t *testing.T) { + assert.NotNil(t, manager.TrackerForUser("user1").GetCALabels()) + assert.Equal(t, []string{"team"}, manager.TrackerForUser("user1").GetCALabels()) + assert.Equal(t, 5, manager.TrackerForUser("user1").GetMaxCardinality()) + + // user2 is not enabled for cost attribution, so tracker would be a NoopTracker + tr2, ok := manager.TrackerForUser("user2").(*NoopTracker) + assert.True(t, ok) + assert.Equal(t, []string(nil), tr2.GetCALabels()) + + assert.Equal(t, []string{"department", "service"}, manager.TrackerForUser("user3").GetCALabels()) + assert.Equal(t, 2, manager.TrackerForUser("user3").GetMaxCardinality()) + + // user4 tenant config doesn't exist, so tracker would be a NoopTracker + tr4, ok := manager.TrackerForUser("user4").(*NoopTracker) + assert.True(t, ok) + assert.Equal(t, []string(nil), tr4.GetCALabels()) + + assert.Equal(t, 2, len(manager.trackersByUserID)) + }) + + t.Run("Track metrics for enabled user", func(t *testing.T) { + // since user2 is not enabled for cost attribution, tracker would be a NoopTracker, no metrics would be tracked + manager.TrackerForUser("user2").IncrementReceivedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, time.Unix(0, 0)) + + // user1 and user3 is enabled for cost attribution, so metrics would be tracked + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings([]string{"department", "foo"}...), 1, "out-of-window", time.Unix(0, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "dodo"}...), 1, time.Unix(20, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "bar"}...), 1, time.Unix(30, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "far"}...), 1, time.Unix(30, 0)) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",user="user1"} 1 + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="__missing__",user="user3"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",user="user3"} 1 + cortex_received_attributed_samples_total{department="foo",service="bar",user="user3"} 1 + cortex_received_attributed_samples_total{department="foo",service="dodo",user="user3"} 1 + ` + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Purge inactive attributions", func(t *testing.T) { + // Purge inactive attributions until time 10, metrics cortex_discarded_attributed_samples_total of user3 should be deleted + manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + assert.Equal(t, 2, len(manager.trackersByUserID)) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",user="user1"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",user="user3"} 1 + cortex_received_attributed_samples_total{department="foo",service="bar",user="user3"} 1 + cortex_received_attributed_samples_total{department="foo",service="dodo",user="user3"} 1 + ` + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Disable user cost attribution, tracker and metrics are removed", func(t *testing.T) { + // We disable cost attribution for user1, so the tracker should be deleted + manager.limits, err = getMockLimits(1) + assert.NoError(t, err) + + manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + assert.Equal(t, 1, len(manager.trackersByUserID)) + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",user="user3"} 1 + cortex_received_attributed_samples_total{department="foo",service="bar",user="user3"} 1 + cortex_received_attributed_samples_total{department="foo",service="dodo",user="user3"} 1 + ` + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Increase user cost attribution max cardinality, since current state is overflow, we clean up the counter", func(t *testing.T) { + // user3 has cost attribution labels department and service, we change it to team and feature. 
user1 should not be affected + manager.limits, err = getMockLimits(3) + assert.NoError(t, err) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + assert.Equal(t, 2, len(manager.trackersByUserID)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",user="user1"} 1 + ` + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + } + assert.Equal(t, 3, manager.TrackerForUser("user3").GetMaxCardinality()) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Increase user cost attribution max cardinality, user is not in overflow, nothing changed", func(t *testing.T) { + // user3 has cost attribution labels department and service, we change it to team and feature + manager.limits, err = getMockLimits(4) + assert.NoError(t, err) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, "invalid-metrics-name", time.Unix(13, 0)) + manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + assert.Equal(t, 2, len(manager.trackersByUserID)) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",user="user1"} 2 + ` + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Change user cost attribution lables, tracker and metrics are reinitialized", func(t *testing.T) { + // user3 has cost attribution labels department and service, we change it to team and feature + manager.limits, err = getMockLimits(5) + assert.NoError(t, err) + + manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + assert.Equal(t, 2, len(manager.trackersByUserID)) + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...)) + }) +} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index f888067426f..96e816bed67 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -61,7 +61,8 @@ type TrackerImp struct { obseveredMtx sync.RWMutex observed map[uint64]*Observation - hashBuffer []byte + hashBuffer []byte + overflowHash uint64 } func (t *TrackerImp) IncrementActiveSeries(lbs labels.Labels, now time.Time) { @@ -79,6 +80,7 @@ func (t *TrackerImp) IncrementReceivedSamples(lbs labels.Labels, value float64, t.receivedSamplesAttribution.WithLabelValues(vals...).Add(value) } +// TODO: bug here, we can update values in the overflow, the reason is that when overflow, we need to change also the values for the overflow hash func (t *TrackerImp) getKeyValues(lbls labels.Labels, ts int64, reason *string) []string { values := make([]string, 
len(t.caLabels)+2) for i, l := range t.caLabels { @@ -99,6 +101,7 @@ func (t *TrackerImp) getKeyValues(lbls labels.Labels, ts int64, reason *string) values[i] = overflowValue } } + if reason == nil { return values[:len(values)-1] } @@ -106,9 +109,12 @@ func (t *TrackerImp) getKeyValues(lbls labels.Labels, ts int64, reason *string) } func (t *TrackerImp) overflow(stream uint64, values []string, ts int64) bool { - // If the maximum cardinality is hit all streams become `__overflow__`. + // If the maximum cardinality is hit all streams become `__overflow__`, the function would return true. + // the origin labels ovserved time is not updated, but the overflow hash is updated. + isOverflow := false if len(t.observed) > t.maxCardinality { - return true + isOverflow = true + stream = t.overflowHash } if o, known := t.observed[stream]; known && o.lastUpdate != nil && o.lastUpdate.Load() < ts { @@ -120,7 +126,7 @@ func (t *TrackerImp) overflow(stream uint64, values []string, ts int64) bool { } } - return false + return isOverflow } // we need the time stamp, since active series could have entered active stripe long time ago, and already evicted @@ -130,12 +136,13 @@ func (t *TrackerImp) DecrementActiveSeries(lbs labels.Labels, ts time.Time) { t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() } -func newTracker(trackedLabels []string, limit int) (*TrackerImp, error) { +func newTracker(userID string, trackedLabels []string, limit int) (*TrackerImp, error) { // keep tracked labels sorted for consistent metric labels sort.Slice(trackedLabels, func(i, j int) bool { return trackedLabels[i] < trackedLabels[j] }) m := &TrackerImp{ + userID: userID, caLabels: trackedLabels, maxCardinality: limit, obseveredMtx: sync.RWMutex{}, @@ -155,10 +162,21 @@ func newTracker(trackedLabels []string, limit int) (*TrackerImp, error) { Name: "cortex_ingester_attributed_active_series", Help: "The total number of active series per user and attribution.", }, append(trackedLabels, "user")), + hashBuffer: make([]byte, 0, 1024), } + m.updateOverFlowHash() return m, nil } +func (t *TrackerImp) updateOverFlowHash() { + b := labels.NewScratchBuilder(len(t.caLabels)) + for _, lb := range t.caLabels { + b.Add(lb, overflowValue) + } + b.Sort() + t.overflowHash = b.Labels().Hash() +} + func (t *TrackerImp) Collect(out chan<- prometheus.Metric) { t.activeSeriesPerUserAttribution.Collect(out) t.receivedSamplesAttribution.Collect(out) @@ -169,14 +187,33 @@ func (t *TrackerImp) Collect(out chan<- prometheus.Metric) { func (t *TrackerImp) Describe(chan<- *prometheus.Desc) { } +// resetObservedIfNeeded checks if the overflow hash is in the observed map and if it is, when dealine is 0, means that +// we just need to clean up the observed map and metrics without checking the deadline. +// Otherwise, we need to check if the last update time of the overflow hash is less than or equal to the deadline. +// return true if the observed map is cleaned up, otherwise false. 
+func (t *TrackerImp) resetObservedIfNeeded(deadline int64) bool { + t.obseveredMtx.Lock() + defer t.obseveredMtx.Unlock() + if ob, ok := t.observed[t.overflowHash]; ok { + if deadline == 0 || (ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline) { + t.observed = map[uint64]*Observation{} + t.cleanupTracker(t.userID) + return true + } + } + return false +} + func (t *TrackerImp) PurgeInactiveObservations(deadline int64) []*Observation { - obs := t.observed - if obs == nil { - return nil + // if overflow is in the observed map and it is reached dealine, we need to clean up the observed map and metrics + isReset := t.resetObservedIfNeeded(deadline) + if isReset { + return []*Observation{} } + // otherwise, we need to check all observations and clean up the ones that are inactive var invalidKeys []uint64 - for labHash, ob := range obs { + for labHash, ob := range t.observed { if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline { invalidKeys = append(invalidKeys, labHash) } @@ -208,18 +245,14 @@ func (t *TrackerImp) PurgeInactiveObservations(deadline int64) []*Observation { } func (t *TrackerImp) UpdateMaxCardinality(limit int) { - // if we are reducing limit, we can just set it + // if we are reducing limit, we can just set it, if it hits the limit, we can't do much about it. if t.maxCardinality >= limit { t.maxCardinality = limit return } - // if we are increasing limit, we need to check if we are already in overflow, - // if yes, reset the counter, otherwise the counters won't be correct - t.obseveredMtx.Lock() - defer t.obseveredMtx.Unlock() - if len(t.observed) > t.maxCardinality { - t.observed = map[uint64]*Observation{} - } + // if we have hit the limit, we need to clear the observed map. The way to tell that we have hit the limit is + // by checking if the overflow hash is in the observed map. This is handled in the resetObservedIfNeeded function. 0 here means no deadline check is needed. + t.resetObservedIfNeeded(0) t.maxCardinality = limit } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 7f99e5d1ef8..5a7ed7c93d8 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -5,20 +5,24 @@ package costattribution import ( "strings" "testing" + "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func Test_NewTracker(t *testing.T) { - reg := prometheus.NewRegistry() - // Initialize a new Tracker + // Setup the test environment + reg := prometheus.NewRegistry() trackedLabel := []string{"platform"} - cat, err := newTracker(trackedLabel, 5) + cat, err := newTracker("user1", trackedLabel, 5) require.NoError(t, err) + + // Register the metrics err = reg.Register(cat) require.NoError(t, err) @@ -28,6 +32,7 @@ func Test_NewTracker(t *testing.T) { cat.receivedSamplesAttribution.WithLabelValues(vals...).Add(5) cat.discardedSampleAttribution.WithLabelValues(append(vals, "out-of-window")...).Add(2) + // Verify the metrics expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
# TYPE cortex_discarded_attributed_samples_total counter @@ -47,6 +52,62 @@ func Test_NewTracker(t *testing.T) { } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) - // Clean the tracker for the user attribution + // Clean up the metrics cat.cleanupTrackerAttribution(vals) } + +func Test_PurgeInactiveObservations(t *testing.T) { + // Setup the test environment, user1 cost attribution label is "team", max cardinality limit is 5 + cat := newTestManager().TrackerForUser("user1") + + // create 2 observations + lbs := []labels.Labels{ + labels.FromStrings([]string{"team", "foo"}...), + labels.FromStrings([]string{"team", "bar"}...), + } + cat.IncrementDiscardedSamples(lbs[0], 1, "invalid-metrics-name", time.Unix(1, 0)) + cat.IncrementDiscardedSamples(lbs[1], 2, "out-of-window-sample", time.Unix(12, 0)) + + // Check the observations + require.Len(t, cat.(*TrackerImp).observed, 2) + + // Purge the observations older than 10 seconds, we should have 1 observation left + purged := cat.PurgeInactiveObservations(10) + + // Verify the purged observations + require.Len(t, purged, 1) + assert.Equal(t, int64(1), purged[0].lastUpdate.Load()) + assert.Equal(t, []string{"foo", "user1", "invalid-metrics-name"}, purged[0].lvalues) + + // Verify the remaining observations + obs := cat.(*TrackerImp).observed + require.Len(t, obs, 1) + assert.Equal(t, int64(12), obs[lbs[1].Hash()].lastUpdate.Load()) +} + +func Test_GetMaxCardinality(t *testing.T) { + // Setup the test environment + cat := newTestManager().TrackerForUser("user1") + + // Verify the max cardinality + assert.Equal(t, 5, cat.GetMaxCardinality()) +} + +func Test_GetCALabels(t *testing.T) { + // Setup the test environment + cat := newTestManager().TrackerForUser("user1") + + // Verify the CA labels + assert.Equal(t, []string{"team"}, cat.GetCALabels()) +} + +func Test_UpdateMaxCardinality(t *testing.T) { + // Setup the test environment + cat := newTestManager().TrackerForUser("user1") + + // Update max cardinality + cat.UpdateMaxCardinality(20) + + // Verify the max cardinality + assert.Equal(t, 20, cat.GetMaxCardinality()) +} From 7f0b3722f0f5b0f51397f74505f370f6a12a2ffc Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 5 Nov 2024 20:08:03 +0100 Subject: [PATCH 09/32] remove noop implementation --- pkg/costattribution/manager.go | 35 ++++-- pkg/costattribution/manager_test.go | 14 +-- pkg/costattribution/tracker.go | 112 +++++++++++------- pkg/costattribution/tracker_test.go | 7 +- pkg/distributor/distributor.go | 8 +- pkg/distributor/validate.go | 15 +-- pkg/distributor/validate_test.go | 13 +- .../activeseries/active_labels_test.go | 3 +- .../active_native_histogram_postings_test.go | 11 +- .../activeseries/active_postings_test.go | 7 +- pkg/ingester/activeseries/active_series.go | 35 +++--- .../activeseries/active_series_test.go | 32 +++-- pkg/ingester/ingester.go | 44 +++---- 13 files changed, 174 insertions(+), 162 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 08e33f0d30f..ee3191bee76 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -28,14 +28,14 @@ type Manager struct { // mu protects the trackersByUserID map mtx sync.RWMutex - trackersByUserID map[string]*TrackerImp + trackersByUserID map[string]*Tracker } // NewManager creates a new cost attribution manager. which is responsible for managing the cost attribution of series. 
// It will clean up inactive series and update the cost attribution of series every 3 minutes. func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager { s := &Manager{ - trackersByUserID: make(map[string]*TrackerImp), + trackersByUserID: make(map[string]*Tracker), limits: limits, mtx: sync.RWMutex{}, inactiveTimeout: inactiveTimeout, @@ -47,6 +47,9 @@ func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logge } func (m *Manager) iteration(_ context.Context) error { + if m == nil { + return nil + } currentTime := time.Now() m.purgeInactiveAttributionsUntil(currentTime.Add(-m.inactiveTimeout).Unix()) return nil @@ -54,14 +57,18 @@ func (m *Manager) iteration(_ context.Context) error { // EnabledForUser returns true if the cost attribution is enabled for the user func (m *Manager) EnabledForUser(userID string) bool { + if m == nil { + return false + } return len(m.limits.CostAttributionLabels(userID)) > 0 } -func (m *Manager) TrackerForUser(userID string) Tracker { - // if cost attribution is not enabled, return nil - if !m.EnabledForUser(userID) { - return NewNoopTracker() +func (m *Manager) TrackerForUser(userID string) *Tracker { + // if manager is not initialized or cost attribution is not enabled, return nil + if m == nil || !m.EnabledForUser(userID) { + return nil } + m.mtx.Lock() defer m.mtx.Unlock() @@ -73,6 +80,9 @@ func (m *Manager) TrackerForUser(userID string) Tracker { } func (m *Manager) Collect(out chan<- prometheus.Metric) { + if m == nil { + return + } m.mtx.RLock() defer m.mtx.RUnlock() for _, tracker := range m.trackersByUserID { @@ -87,6 +97,9 @@ func (m *Manager) Describe(chan<- *prometheus.Desc) { // deleteUserTracer is delete user tracker since the user is disabled for cost attribution func (m *Manager) deleteUserTracer(userID string) { + if m == nil { + return + } m.mtx.Lock() defer m.mtx.Unlock() if _, exists := m.trackersByUserID[userID]; !exists { @@ -98,6 +111,9 @@ func (m *Manager) deleteUserTracer(userID string) { } func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) { + if m == nil { + return + } // Get all userIDs from the map m.mtx.RLock() userIDs := make([]string, 0, len(m.trackersByUserID)) @@ -135,9 +151,12 @@ func compareStringSlice(a, b []string) bool { } func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64) []*Observation { + if m == nil { + return nil + } + cat := m.TrackerForUser(userID) - if _, ok := cat.(*NoopTracker); ok { - // It's a noop implementation + if cat == nil { return nil } diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index b25e0c8d926..78c774b2908 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -85,24 +85,24 @@ func Test_CreateDeleteTracker(t *testing.T) { assert.Equal(t, []string{"team"}, manager.TrackerForUser("user1").GetCALabels()) assert.Equal(t, 5, manager.TrackerForUser("user1").GetMaxCardinality()) - // user2 is not enabled for cost attribution, so tracker would be a NoopTracker - tr2, ok := manager.TrackerForUser("user2").(*NoopTracker) - assert.True(t, ok) + // user2 is not enabled for cost attribution, so tracker would be nil + tr2 := manager.TrackerForUser("user2") + assert.Nil(t, tr2) assert.Equal(t, []string(nil), tr2.GetCALabels()) assert.Equal(t, []string{"department", "service"}, manager.TrackerForUser("user3").GetCALabels()) assert.Equal(t, 2, manager.TrackerForUser("user3").GetMaxCardinality()) - // user4 
tenant config doesn't exist, so tracker would be a NoopTracker - tr4, ok := manager.TrackerForUser("user4").(*NoopTracker) - assert.True(t, ok) + // user4 tenant config doesn't exist, so tracker would be nil + tr4 := manager.TrackerForUser("user4") + assert.Nil(t, tr4) assert.Equal(t, []string(nil), tr4.GetCALabels()) assert.Equal(t, 2, len(manager.trackersByUserID)) }) t.Run("Track metrics for enabled user", func(t *testing.T) { - // since user2 is not enabled for cost attribution, tracker would be a NoopTracker, no metrics would be tracked + // since user2 is not enabled for cost attribution, tracker would be nil, no metrics would be tracked manager.TrackerForUser("user2").IncrementReceivedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, time.Unix(0, 0)) // user1 and user3 is enabled for cost attribution, so metrics would be tracked diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 96e816bed67..e9ac98a68ab 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -12,44 +12,45 @@ import ( "go.uber.org/atomic" ) -type Tracker interface { - IncrementActiveSeries(labels.Labels, time.Time) - IncrementDiscardedSamples(labels.Labels, float64, string, time.Time) - IncrementReceivedSamples(labels.Labels, float64, time.Time) - DecrementActiveSeries(labels.Labels, time.Time) - PurgeInactiveObservations(int64) []*Observation - UpdateMaxCardinality(int) - GetMaxCardinality() int - GetCALabels() []string -} - type Observation struct { lvalues []string lastUpdate *atomic.Int64 } -func (t *TrackerImp) GetCALabels() []string { +func (t *Tracker) GetCALabels() []string { + if t == nil { + return nil + } return t.caLabels } -func (t *TrackerImp) GetMaxCardinality() int { +func (t *Tracker) GetMaxCardinality() int { + if t == nil { + return 0 + } return t.maxCardinality } -func (t *TrackerImp) cleanupTrackerAttribution(vals []string) { +func (t *Tracker) cleanupTrackerAttribution(vals []string) { + if t == nil { + return + } t.activeSeriesPerUserAttribution.DeleteLabelValues(vals...) t.receivedSamplesAttribution.DeleteLabelValues(vals...) t.discardedSampleAttribution.DeleteLabelValues(vals...) 
} -func (t *TrackerImp) cleanupTracker(userID string) { +func (t *Tracker) cleanupTracker(userID string) { + if t == nil { + return + } filter := prometheus.Labels{"user": userID} t.activeSeriesPerUserAttribution.DeletePartialMatch(filter) t.receivedSamplesAttribution.DeletePartialMatch(filter) t.discardedSampleAttribution.DeletePartialMatch(filter) } -type TrackerImp struct { +type Tracker struct { userID string caLabels []string maxCardinality int @@ -65,23 +66,35 @@ type TrackerImp struct { overflowHash uint64 } -func (t *TrackerImp) IncrementActiveSeries(lbs labels.Labels, now time.Time) { +func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { + if t == nil { + return + } vals := t.getKeyValues(lbs, now.Unix(), nil) t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() } -func (t *TrackerImp) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { +func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { + if t == nil { + return + } vals := t.getKeyValues(lbs, now.Unix(), &reason) t.discardedSampleAttribution.WithLabelValues(vals...).Add(value) } -func (t *TrackerImp) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { +func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { + if t == nil { + return + } vals := t.getKeyValues(lbs, now.Unix(), nil) t.receivedSamplesAttribution.WithLabelValues(vals...).Add(value) } // TODO: bug here, we can update values in the overflow, the reason is that when overflow, we need to change also the values for the overflow hash -func (t *TrackerImp) getKeyValues(lbls labels.Labels, ts int64, reason *string) []string { +func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64, reason *string) []string { + if t == nil { + return nil + } values := make([]string, len(t.caLabels)+2) for i, l := range t.caLabels { values[i] = lbls.Get(l) @@ -108,7 +121,10 @@ func (t *TrackerImp) getKeyValues(lbls labels.Labels, ts int64, reason *string) return values } -func (t *TrackerImp) overflow(stream uint64, values []string, ts int64) bool { +func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { + if t == nil { + return false + } // If the maximum cardinality is hit all streams become `__overflow__`, the function would return true. // the origin labels ovserved time is not updated, but the overflow hash is updated. 
isOverflow := false @@ -131,17 +147,20 @@ func (t *TrackerImp) overflow(stream uint64, values []string, ts int64) bool { // we need the time stamp, since active series could have entered active stripe long time ago, and already evicted // from the observed map but still in the active Stripe -func (t *TrackerImp) DecrementActiveSeries(lbs labels.Labels, ts time.Time) { +func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, ts time.Time) { + if t == nil { + return + } vals := t.getKeyValues(lbs, ts.Unix(), nil) t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() } -func newTracker(userID string, trackedLabels []string, limit int) (*TrackerImp, error) { +func newTracker(userID string, trackedLabels []string, limit int) (*Tracker, error) { // keep tracked labels sorted for consistent metric labels sort.Slice(trackedLabels, func(i, j int) bool { return trackedLabels[i] < trackedLabels[j] }) - m := &TrackerImp{ + m := &Tracker{ userID: userID, caLabels: trackedLabels, maxCardinality: limit, @@ -168,7 +187,10 @@ func newTracker(userID string, trackedLabels []string, limit int) (*TrackerImp, return m, nil } -func (t *TrackerImp) updateOverFlowHash() { +func (t *Tracker) updateOverFlowHash() { + if t == nil { + return + } b := labels.NewScratchBuilder(len(t.caLabels)) for _, lb := range t.caLabels { b.Add(lb, overflowValue) @@ -177,21 +199,31 @@ func (t *TrackerImp) updateOverFlowHash() { t.overflowHash = b.Labels().Hash() } -func (t *TrackerImp) Collect(out chan<- prometheus.Metric) { +func (t *Tracker) Collect(out chan<- prometheus.Metric) { + if t == nil { + return + } t.activeSeriesPerUserAttribution.Collect(out) t.receivedSamplesAttribution.Collect(out) t.discardedSampleAttribution.Collect(out) } // Describe implements prometheus.Collector. -func (t *TrackerImp) Describe(chan<- *prometheus.Desc) { +func (t *Tracker) Describe(chan<- *prometheus.Desc) { + // this is an unchecked collector + if t == nil { + return + } } // resetObservedIfNeeded checks if the overflow hash is in the observed map and if it is, when dealine is 0, means that // we just need to clean up the observed map and metrics without checking the deadline. // Otherwise, we need to check if the last update time of the overflow hash is less than or equal to the deadline. // return true if the observed map is cleaned up, otherwise false. -func (t *TrackerImp) resetObservedIfNeeded(deadline int64) bool { +func (t *Tracker) resetObservedIfNeeded(deadline int64) bool { + if t == nil { + return false + } t.obseveredMtx.Lock() defer t.obseveredMtx.Unlock() if ob, ok := t.observed[t.overflowHash]; ok { @@ -204,7 +236,10 @@ func (t *TrackerImp) resetObservedIfNeeded(deadline int64) bool { return false } -func (t *TrackerImp) PurgeInactiveObservations(deadline int64) []*Observation { +func (t *Tracker) PurgeInactiveObservations(deadline int64) []*Observation { + if t == nil { + return nil + } // if overflow is in the observed map and it is reached dealine, we need to clean up the observed map and metrics isReset := t.resetObservedIfNeeded(deadline) if isReset { @@ -244,7 +279,10 @@ func (t *TrackerImp) PurgeInactiveObservations(deadline int64) []*Observation { return res[:len(invalidKeys)] } -func (t *TrackerImp) UpdateMaxCardinality(limit int) { +func (t *Tracker) UpdateMaxCardinality(limit int) { + if t == nil { + return + } // if we are reducing limit, we can just set it, if it hits the limit, we can't do much about it. 
if t.maxCardinality >= limit { t.maxCardinality = limit @@ -255,17 +293,3 @@ func (t *TrackerImp) UpdateMaxCardinality(limit int) { t.resetObservedIfNeeded(0) t.maxCardinality = limit } - -type NoopTracker struct{} - -func NewNoopTracker() *NoopTracker { - return &NoopTracker{} -} -func (*NoopTracker) IncrementActiveSeries(labels.Labels, time.Time) {} -func (*NoopTracker) IncrementDiscardedSamples(labels.Labels, float64, string, time.Time) {} -func (*NoopTracker) IncrementReceivedSamples(labels.Labels, float64, time.Time) {} -func (*NoopTracker) DecrementActiveSeries(labels.Labels, time.Time) {} -func (*NoopTracker) PurgeInactiveObservations(int64) []*Observation { return nil } -func (*NoopTracker) UpdateMaxCardinality(int) {} -func (*NoopTracker) GetMaxCardinality() int { return 0 } -func (*NoopTracker) GetCALabels() []string { return nil } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 5a7ed7c93d8..e91f86328f5 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -69,7 +69,7 @@ func Test_PurgeInactiveObservations(t *testing.T) { cat.IncrementDiscardedSamples(lbs[1], 2, "out-of-window-sample", time.Unix(12, 0)) // Check the observations - require.Len(t, cat.(*TrackerImp).observed, 2) + require.Len(t, cat.observed, 2) // Purge the observations older than 10 seconds, we should have 1 observation left purged := cat.PurgeInactiveObservations(10) @@ -80,9 +80,8 @@ func Test_PurgeInactiveObservations(t *testing.T) { assert.Equal(t, []string{"foo", "user1", "invalid-metrics-name"}, purged[0].lvalues) // Verify the remaining observations - obs := cat.(*TrackerImp).observed - require.Len(t, obs, 1) - assert.Equal(t, int64(12), obs[lbs[1].Hash()].lastUpdate.Load()) + require.Len(t, cat.observed, 1) + assert.Equal(t, int64(12), cat.observed[lbs[1].Hash()].lastUpdate.Load()) } func Test_GetMaxCardinality(t *testing.T) { diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index ff6ec129e40..dea0badceb0 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -712,7 +712,7 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. 
func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) error { - cat := getCATrackerForUser(userID, d.costAttributionMgr) + cat := d.costAttributionMgr.TrackerForUser(userID) if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return err } @@ -854,7 +854,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) - getCATrackerForUser(userID, d.costAttributionMgr).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1111,7 +1111,7 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { if len(req.Timeseries) > 0 { - getCATrackerForUser(userID, d.costAttributionMgr).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) @@ -1676,7 +1676,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - getCATrackerForUser(userID, d.costAttributionMgr).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), mtime.Now()) + d.costAttributionMgr.TrackerForUser(userID).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), mtime.Now()) } receivedMetadata = len(req.Metadata) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index c7daf466f32..4536c848123 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -222,7 +222,7 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
-func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat costattribution.Tracker) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) @@ -243,7 +243,7 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat costattribution.Tracker) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() @@ -380,18 +380,9 @@ func removeNonASCIIChars(in string) (out string) { return out } -// getCATrackerForUser returns the cost attribution tracker for the user. -// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns a noop tracker. -func getCATrackerForUser(userID string, cam *costattribution.Manager) costattribution.Tracker { - if cam == nil { - return costattribution.NewNoopTracker() - } - return cam.TrackerForUser(userID) -} - // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. 
-func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat costattribution.Tracker, ts time.Time) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMissingMetricName, ts) diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index 2ff553a0092..4d8fa7a727b 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -19,7 +19,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/validation" ) @@ -199,7 +198,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, costattribution.NewNoopTracker(), ts) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, nil, ts) assert.Equal(t, c.err, err, "wrong error") } @@ -399,11 +398,10 @@ func TestValidateLabelDuplication(t *testing.T) { cfg.maxLabelValueLength = 10 userID := "testUser" - actual := validateLabels(newSampleValidationMetrics(nil), cfg, userID, "", []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "a"}, {Name: model.MetricNameLabel, Value: "b"}, - }, false, false, costattribution.NewNoopTracker(), ts) + }, false, false, nil, ts) expected := fmt.Errorf( duplicateLabelMsgFormat, model.MetricNameLabel, @@ -420,7 +418,7 @@ func TestValidateLabelDuplication(t *testing.T) { {Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}, {Name: "a", Value: "a"}, - }, false, false, costattribution.NewNoopTracker(), ts) + }, false, false, nil, ts) expected = fmt.Errorf( duplicateLabelMsgFormat, "a", @@ -571,7 +569,6 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { registry := prometheus.NewRegistry() metrics := newSampleValidationMetrics(registry) - for _, limit := range []int{0, 1, 2} { for name, h := range testCases { t.Run(fmt.Sprintf("limit-%d-%s", limit, name), func(t *testing.T) { @@ -579,7 +576,7 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { cfg.maxNativeHistogramBuckets = limit ls := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, costattribution.NewNoopTracker()) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, nil) if limit == 1 { require.Error(t, err) @@ -626,7 +623,7 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { hist.Schema = testCase.schema - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, costattribution.NewNoopTracker()) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, nil) require.Equal(t, 
testCase.expectedError, err) }) } diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index 0df17809f3b..6fdf3e00bc4 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -12,7 +12,6 @@ import ( "github.com/prometheus/prometheus/tsdb/index" "github.com/stretchr/testify/require" - "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -42,7 +41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) memPostings := index.NewMemPostings() for i, l := range series { diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 4467ab6d2ea..2b95020c68d 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -11,7 +11,6 @@ import ( "github.com/prometheus/prometheus/tsdb/index" "github.com/stretchr/testify/require" - "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -27,7 +26,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -63,7 +62,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -107,7 +106,7 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -147,7 +146,7 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. 
for i := range allStorageRefs { @@ -184,7 +183,7 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index 7209a81d29d..84c71634e72 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -11,7 +11,6 @@ import ( "github.com/prometheus/prometheus/tsdb/index" "github.com/stretchr/testify/require" - "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -27,7 +26,7 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -59,7 +58,7 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -91,7 +90,7 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), costattribution.NewNoopTracker()) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 1194a21f722..9ee8f4909d3 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -49,7 +49,7 @@ type ActiveSeries struct { // configMutex protects matchers and lastMatchersUpdate. it used by both matchers and cat configMutex sync.RWMutex matchers *asmodel.Matchers - cat costattribution.Tracker + cat *costattribution.Tracker lastConfigUpdate time.Time // The duration after which series become inactive. @@ -67,7 +67,7 @@ type seriesStripe struct { // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). oldestEntryTs atomic.Int64 - cat costattribution.Tracker + cat *costattribution.Tracker mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. 
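The hunks above and below replace the costattribution.Tracker interface with a concrete *costattribution.Tracker and delete NoopTracker; call sites and tests now pass nil wherever cost attribution is disabled. That is only safe because the Tracker methods guard against a nil receiver, as CALabels, MaxCardinality, cleanupTracker and overflow do elsewhere in this series. A standalone sketch of the pattern, using simplified stand-in signatures rather than the real ones:

package main

import "fmt"

// Tracker is a stand-in for costattribution.Tracker; only the nil-receiver
// behaviour is illustrated here, not the real metrics bookkeeping.
type Tracker struct {
	caLabels []string
}

// CALabels mirrors the patched accessor: a nil receiver yields nil.
func (t *Tracker) CALabels() []string {
	if t == nil {
		return nil
	}
	return t.caLabels
}

// IncrementDiscardedSamples is a simplified stand-in: on a nil receiver it
// is a no-op, which is exactly what the old NoopTracker provided.
func (t *Tracker) IncrementDiscardedSamples(value float64, reason string) {
	if t == nil {
		return
	}
	fmt.Printf("attributing %v discarded samples (%s) to %v\n", value, reason, t.caLabels)
}

func main() {
	var disabled *Tracker // what a manager returns when attribution is off
	disabled.IncrementDiscardedSamples(1, "rate-limited") // safe no-op
	fmt.Println(disabled.CALabels() == nil)               // prints true

	enabled := &Tracker{caLabels: []string{"team"}}
	enabled.IncrementDiscardedSamples(1, "rate-limited")
}

Calling a method through a nil pointer is well defined in Go as long as the method does not dereference its receiver, which is why a separate no-op implementation is no longer needed.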
@@ -91,7 +91,7 @@ type seriesEntry struct { func NewActiveSeries( asm *asmodel.Matchers, timeout time.Duration, - cat costattribution.Tracker, + cat *costattribution.Tracker, ) *ActiveSeries { c := &ActiveSeries{ matchers: asm, timeout: timeout, cat: cat, @@ -112,24 +112,25 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { } // Function to compare two Tracker instances -func areTrackersEqual(t1, t2 costattribution.Tracker) bool { - if t1 == t2 { - // If both trackers are the same pointer (including nil), they are equal - return true +func areTrackersEqual(t1, t2 *costattribution.Tracker) bool { + cal1 := t1.GetCALabels() + cal2 := t2.GetCALabels() + if len(cal1) != len(cal2) { + return false + } + for i := range cal1 { + if cal1[i] != cal2[i] { + return false + } } - - // Use type assertion to check if both are NoopTracker - _, isNoop1 := t1.(*costattribution.NoopTracker) - _, isNoop2 := t2.(*costattribution.NoopTracker) - - // If both are NoopTracker instances, treat them as equal - return isNoop1 && isNoop2 + return true } -func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg costattribution.Tracker) bool { +func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { if ctCfg.String() != c.CurrentConfig().String() { return true } + return !areTrackersEqual(caCfg, c.CurrentCostAttributionTracker()) } @@ -150,7 +151,7 @@ func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { return c.matchers.Config() } -func (c *ActiveSeries) CurrentCostAttributionTracker() costattribution.Tracker { +func (c *ActiveSeries) CurrentCostAttributionTracker() *costattribution.Tracker { c.configMutex.RLock() defer c.configMutex.RUnlock() return c.cat @@ -458,7 +459,7 @@ func (s *seriesStripe) clear() { func (s *seriesStripe) reinitialize( asm *asmodel.Matchers, deleted *deletedSeries, - cat costattribution.Tracker, + cat *costattribution.Tracker, ) { s.mu.Lock() defer s.mu.Unlock() diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index 0c8976da536..ca36450f823 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -20,7 +20,6 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/atomic" - "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -38,8 +37,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref3, ls3 := storage.SeriesRef(3), labels.FromStrings("a", "3") ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. 
- - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() @@ -204,7 +202,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { @@ -231,7 +229,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) } @@ -448,7 +446,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) c.Clear() @@ -488,8 +486,7 @@ func labelsWithHashCollision() (labels.Labels, labels.Labels) { func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) - - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) @@ -517,7 +514,7 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) for i := 0; i < len(series); i++ { c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) @@ -563,7 +560,7 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute, costattribution.NewNoopTracker()) + c := NewActiveSeries(asm, 5*time.Minute, nil) exp := len(series) - ttl expMatchingSeries := 0 @@ -596,7 +593,7 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, costattribution.NewNoopTracker()) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, nil) c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1, nil) c.UpdateSeries(ls2, ref2, currentTime, -1, nil) @@ -632,7 +629,7 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": 
`{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(asm, DefaultTimeout, nil) valid := c.Purge(currentTime, nil) assert.True(t, valid) @@ -698,7 +695,7 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(asm, DefaultTimeout, nil) valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -736,8 +733,7 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) currentTime := time.Now() - - c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(asm, DefaultTimeout, nil) valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() @@ -790,7 +786,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. - c = NewActiveSeries(&asmodel.Matchers{}, 0, costattribution.NewNoopTracker()) + c = NewActiveSeries(&asmodel.Matchers{}, 0, nil) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -928,7 +924,7 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(asm, DefaultTimeout, nil) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1, nil) @@ -953,7 +949,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, costattribution.NewNoopTracker()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 3f701c1eec7..d061d515723 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -774,15 +774,6 @@ func (i *Ingester) replaceMatchers(asm *asmodel.Matchers, userDB *userTSDB, now userDB.activeSeries.ReloadMatchers(asm, now) } -// getCATrackerForUser returns the cost attribution tracker for the user. -// If the cost attribution manager is nil or the user is not enabled for cost attribution, it returns a noop tracker. 
-func getCATrackerForUser(userID string, cam *costattribution.Manager) costattribution.Tracker { - if cam == nil { - return costattribution.NewNoopTracker() - } - return cam.TrackerForUser(userID) -} - func (i *Ingester) updateActiveSeries(now time.Time) { for _, userID := range i.getTSDBUsers() { userDB := i.getTSDB(userID) @@ -791,7 +782,7 @@ func (i *Ingester) updateActiveSeries(now time.Time) { } newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) - newCostAttributionTracker := getCATrackerForUser(userID, i.costAttributionMgr) + newCostAttributionTracker := i.costAttributionMgr.TrackerForUser(userID) if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionTracker) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } @@ -1303,9 +1294,6 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { // Return true if handled as soft error, and we can ingest more series. - // get the cost attribution value for the series - cat := getCATrackerForUser(userID, i.costAttributionMgr) - handleAppendError := func(err error, timestamp int64, labels []mimirpb.LabelAdapter) bool { stats.failedSamplesCount++ @@ -1315,7 +1303,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // we actually ingested all samples which haven't failed. switch { case errors.Is(err, storage.ErrOutOfBounds): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfBounds, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfBounds, startAppend) stats.sampleOutOfBoundsCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) @@ -1323,7 +1311,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrOutOfOrderSample): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) stats.sampleOutOfOrderCount++ updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) @@ -1331,7 +1319,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrTooOldSample): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) stats.sampleTooOldCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) @@ -1339,7 +1327,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.SampleTooFarInFuture): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) + 
i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) stats.sampleTooFarInFutureCount++ updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) @@ -1347,7 +1335,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, storage.ErrDuplicateSampleForTimestamp): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) stats.newValueForTimestampCount++ updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) @@ -1355,7 +1343,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.MaxSeriesPerUser): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) stats.perUserSeriesLimitCount++ updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) @@ -1363,7 +1351,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return true case errors.Is(err, globalerror.MaxSeriesPerMetric): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) stats.perMetricSeriesLimitCount++ updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) @@ -1378,35 +1366,35 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre }) return true case errors.Is(err, histogram.ErrHistogramCountMismatch): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramCountNotBigEnough): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return 
newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramNegativeBucketCount): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramSpanNegativeOffset): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) return true case errors.Is(err, histogram.ErrHistogramSpansBucketsMismatch): - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) stats.invalidNativeHistogramCount++ updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) @@ -1449,7 +1437,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleOutOfBoundsCount += len(ts.Samples) + len(ts.Histograms) - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleOutOfBounds, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleOutOfBounds, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1470,7 +1458,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) stats.sampleOutOfBoundsCount += len(ts.Samples) - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleOutOfBounds, startAppend) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleOutOfBounds, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -2689,7 +2677,7 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD activeSeries: activeseries.NewActiveSeries( asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, - getCATrackerForUser(userID, i.costAttributionMgr), + 
i.costAttributionMgr.TrackerForUser(userID), ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), From 50fa0eca5ac85745f60589a764a0e40ada5dcea5 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 5 Nov 2024 20:44:16 +0100 Subject: [PATCH 10/32] add new discarded sample metrics --- pkg/blockbuilder/tsdb.go | 2 +- pkg/ingester/ingester.go | 15 ++++++++++++++- pkg/storage/soft_append_error_processor.go | 6 +++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pkg/blockbuilder/tsdb.go b/pkg/blockbuilder/tsdb.go index ab865f39a5e..77a54d809fe 100644 --- a/pkg/blockbuilder/tsdb.go +++ b/pkg/blockbuilder/tsdb.go @@ -48,7 +48,7 @@ type TSDBBuilder struct { var softErrProcessor = mimir_storage.NewSoftAppendErrorProcessor( func() {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, - func() {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, + func([]mimirpb.LabelAdapter) {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, ) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 14b5d633acc..0513bae7161 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1198,48 +1198,56 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfBoundsCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfBounds, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooOldCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooFarInFutureCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.newValueForTimestampCount++ + 
i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) }) }, - func() { + func(labels []mimirpb.LabelAdapter) { stats.perUserSeriesLimitCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) }) }, func(labels []mimirpb.LabelAdapter) { stats.perMetricSeriesLimitCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { e := newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) return e @@ -1247,30 +1255,35 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return 
newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) }) diff --git a/pkg/storage/soft_append_error_processor.go b/pkg/storage/soft_append_error_processor.go index 0f02131537d..d8df5714361 100644 --- a/pkg/storage/soft_append_error_processor.go +++ b/pkg/storage/soft_append_error_processor.go @@ -22,7 +22,7 @@ type SoftAppendErrorProcessor struct { errTooOldSample func(int64, []mimirpb.LabelAdapter) sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter) errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter) - maxSeriesPerUser func() + maxSeriesPerUser func([]mimirpb.LabelAdapter) maxSeriesPerMetric func(labels []mimirpb.LabelAdapter) errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter) errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter) @@ -39,7 +39,7 @@ func NewSoftAppendErrorProcessor( errTooOldSample func(int64, []mimirpb.LabelAdapter), sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter), errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter), - maxSeriesPerUser func(), + maxSeriesPerUser func([]mimirpb.LabelAdapter), maxSeriesPerMetric func(labels []mimirpb.LabelAdapter), errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter), errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter), @@ -89,7 +89,7 @@ func (e *SoftAppendErrorProcessor) ProcessErr(err error, ts int64, labels []mimi e.errDuplicateSampleForTimestamp(ts, labels) return true case errors.Is(err, globalerror.MaxSeriesPerUser): - e.maxSeriesPerUser() + e.maxSeriesPerUser(labels) return true case errors.Is(err, globalerror.MaxSeriesPerMetric): e.maxSeriesPerMetric(labels) From abdd0cc905227e128d1b988fd49cd58e3d394cad Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 5 Nov 2024 21:01:19 +0100 Subject: [PATCH 11/32] fix test --- pkg/costattribution/tracker_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index e91f86328f5..d35ec6e4f1c 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -81,7 +81,7 @@ func Test_PurgeInactiveObservations(t *testing.T) { // Verify the remaining observations require.Len(t, cat.observed, 1) - assert.Equal(t, int64(12), cat.observed[lbs[1].Hash()].lastUpdate.Load()) + assert.NotNil(t, cat.observed[lbs[1].Hash()].lastUpdate) } func Test_GetMaxCardinality(t *testing.T) { From 698a5c6b682f235a88584e6b55729419ea394a56 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 6 Nov 2024 14:06:08 +0100 Subject: [PATCH 12/32] address comment to combine 2 config compare --- pkg/costattribution/manager.go | 8 ++-- pkg/costattribution/manager_test.go | 44 +++++++++++----------- pkg/costattribution/tracker.go | 33 +++++++++------- pkg/costattribution/tracker_test.go | 12 +++--- pkg/ingester/activeseries/active_series.go | 32 ++-------------- 5 files changed, 57 insertions(+), 72 deletions(-) diff --git a/pkg/costattribution/manager.go 
b/pkg/costattribution/manager.go index ee3191bee76..9bf878dbbe6 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -138,7 +138,8 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) { } // compare two sorted string slices -func compareStringSlice(a, b []string) bool { +// true if they are equal, otherwise false +func CompareCALabels(a, b []string) bool { if len(a) != len(b) { return false } @@ -164,14 +165,15 @@ func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64 sort.Slice(newTrackedLabels, func(i, j int) bool { return newTrackedLabels[i] < newTrackedLabels[j] }) + // if they are different, we need to update the tracker, we don't mind, just reinitialized the tracker - if !compareStringSlice(cat.GetCALabels(), newTrackedLabels) { + if !CompareCALabels(cat.CALabels(), newTrackedLabels) { m.mtx.Lock() m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) // update the tracker with the new tracker cat = m.trackersByUserID[userID] m.mtx.Unlock() - } else if maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID); cat.GetMaxCardinality() != maxCardinality { + } else if maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID); cat.MaxCardinality() != maxCardinality { // if the maxCardinality is different, update the tracker cat.UpdateMaxCardinality(maxCardinality) } diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 78c774b2908..78a3f3fcf3a 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -81,22 +81,22 @@ func Test_CreateDeleteTracker(t *testing.T) { require.NoError(t, err) t.Run("Get tracker for user", func(t *testing.T) { - assert.NotNil(t, manager.TrackerForUser("user1").GetCALabels()) - assert.Equal(t, []string{"team"}, manager.TrackerForUser("user1").GetCALabels()) - assert.Equal(t, 5, manager.TrackerForUser("user1").GetMaxCardinality()) + assert.NotNil(t, manager.TrackerForUser("user1").CALabels()) + assert.Equal(t, []string{"team"}, manager.TrackerForUser("user1").CALabels()) + assert.Equal(t, 5, manager.TrackerForUser("user1").MaxCardinality()) // user2 is not enabled for cost attribution, so tracker would be nil tr2 := manager.TrackerForUser("user2") assert.Nil(t, tr2) - assert.Equal(t, []string(nil), tr2.GetCALabels()) + assert.Equal(t, []string(nil), tr2.CALabels()) - assert.Equal(t, []string{"department", "service"}, manager.TrackerForUser("user3").GetCALabels()) - assert.Equal(t, 2, manager.TrackerForUser("user3").GetMaxCardinality()) + assert.Equal(t, []string{"department", "service"}, manager.TrackerForUser("user3").CALabels()) + assert.Equal(t, 2, manager.TrackerForUser("user3").MaxCardinality()) // user4 tenant config doesn't exist, so tracker would be nil tr4 := manager.TrackerForUser("user4") assert.Nil(t, tr4) - assert.Equal(t, []string(nil), tr4.GetCALabels()) + assert.Equal(t, []string(nil), tr4.CALabels()) assert.Equal(t, 2, len(manager.trackersByUserID)) }) @@ -114,13 +114,13 @@ func Test_CreateDeleteTracker(t *testing.T) { expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
# TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",user="user1"} 1 - cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="__missing__",user="user3"} 1 + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="__missing__",tenant="user3",tracker="custom_attribution"} 1 # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",user="user3"} 1 - cortex_received_attributed_samples_total{department="foo",service="bar",user="user3"} 1 - cortex_received_attributed_samples_total{department="foo",service="dodo",user="user3"} 1 + cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",tenant="user3",tracker="custom_attribution"} 1 + cortex_received_attributed_samples_total{department="foo",service="bar",tenant="user3",tracker="custom_attribution"} 1 + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="custom_attribution"} 1 ` metricNames := []string{ "cortex_discarded_attributed_samples_total", @@ -136,12 +136,12 @@ func Test_CreateDeleteTracker(t *testing.T) { expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",user="user1"} 1 + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",user="user3"} 1 - cortex_received_attributed_samples_total{department="foo",service="bar",user="user3"} 1 - cortex_received_attributed_samples_total{department="foo",service="dodo",user="user3"} 1 + cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",tenant="user3",tracker="custom_attribution"} 1 + cortex_received_attributed_samples_total{department="foo",service="bar",tenant="user3",tracker="custom_attribution"} 1 + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="custom_attribution"} 1 ` metricNames := []string{ "cortex_discarded_attributed_samples_total", @@ -160,9 +160,9 @@ func Test_CreateDeleteTracker(t *testing.T) { expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
# TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",user="user3"} 1 - cortex_received_attributed_samples_total{department="foo",service="bar",user="user3"} 1 - cortex_received_attributed_samples_total{department="foo",service="dodo",user="user3"} 1 + cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",tenant="user3",tracker="custom_attribution"} 1 + cortex_received_attributed_samples_total{department="foo",service="bar",tenant="user3",tracker="custom_attribution"} 1 + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="custom_attribution"} 1 ` metricNames := []string{ "cortex_discarded_attributed_samples_total", @@ -182,13 +182,13 @@ func Test_CreateDeleteTracker(t *testing.T) { expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",user="user1"} 1 + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 ` metricNames := []string{ "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total", } - assert.Equal(t, 3, manager.TrackerForUser("user3").GetMaxCardinality()) + assert.Equal(t, 3, manager.TrackerForUser("user3").MaxCardinality()) assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) }) @@ -202,7 +202,7 @@ func Test_CreateDeleteTracker(t *testing.T) { expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
# TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",user="user1"} 2 + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 2 ` metricNames := []string{ "cortex_discarded_attributed_samples_total", diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index e9ac98a68ab..8446971a384 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -17,14 +17,19 @@ type Observation struct { lastUpdate *atomic.Int64 } -func (t *Tracker) GetCALabels() []string { +const ( + TrackerLabel = "tracker" + TenantLabel = "tenant" +) + +func (t *Tracker) CALabels() []string { if t == nil { return nil } return t.caLabels } -func (t *Tracker) GetMaxCardinality() int { +func (t *Tracker) MaxCardinality() int { if t == nil { return 0 } @@ -44,7 +49,7 @@ func (t *Tracker) cleanupTracker(userID string) { if t == nil { return } - filter := prometheus.Labels{"user": userID} + filter := prometheus.Labels{TenantLabel: userID} t.activeSeriesPerUserAttribution.DeletePartialMatch(filter) t.receivedSamplesAttribution.DeletePartialMatch(filter) t.discardedSampleAttribution.DeletePartialMatch(filter) @@ -90,7 +95,6 @@ func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now t.receivedSamplesAttribution.WithLabelValues(vals...).Add(value) } -// TODO: bug here, we can update values in the overflow, the reason is that when overflow, we need to change also the values for the overflow hash func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64, reason *string) []string { if t == nil { return nil @@ -168,19 +172,22 @@ func newTracker(userID string, trackedLabels []string, limit int) (*Tracker, err observed: map[uint64]*Observation{}, //lint:ignore faillint the metrics are registered in the mimir package discardedSampleAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_discarded_attributed_samples_total", - Help: "The total number of samples that were discarded per attribution.", - }, append(trackedLabels, "user", "reason")), + Name: "cortex_discarded_attributed_samples_total", + Help: "The total number of samples that were discarded per attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, + }, append(trackedLabels, TenantLabel, "reason")), //lint:ignore faillint the metrics are registered in the mimir package receivedSamplesAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_received_attributed_samples_total", - Help: "The total number of samples that were received per attribution.", - }, append(trackedLabels, "user")), + Name: "cortex_received_attributed_samples_total", + Help: "The total number of samples that were received per attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, + }, append(trackedLabels, TenantLabel)), //lint:ignore faillint the metrics are registered in the mimir package activeSeriesPerUserAttribution: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_ingester_attributed_active_series", - Help: "The total number of active series per user and attribution.", - }, append(trackedLabels, "user")), + Name: "cortex_ingester_attributed_active_series", + Help: "The total number of active series per user and attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, + }, append(trackedLabels, TenantLabel)), hashBuffer: make([]byte, 
0, 1024), } m.updateOverFlowHash() diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index d35ec6e4f1c..14cab4f2f13 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -36,13 +36,13 @@ func Test_NewTracker(t *testing.T) { expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{platform="foo",reason="out-of-window", user="user1"} 2 + cortex_discarded_attributed_samples_total{platform="foo",reason="out-of-window", tenant="user1",tracker="custom_attribution"} 2 # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. # TYPE cortex_ingester_attributed_active_series gauge - cortex_ingester_attributed_active_series{platform="foo",user="user1"} 1 + cortex_ingester_attributed_active_series{platform="foo",tenant="user1",tracker="custom_attribution"} 1 # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{platform="foo",user="user1"} 5 + cortex_received_attributed_samples_total{platform="foo",tenant="user1",tracker="custom_attribution"} 5 ` metricNames := []string{ @@ -89,7 +89,7 @@ func Test_GetMaxCardinality(t *testing.T) { cat := newTestManager().TrackerForUser("user1") // Verify the max cardinality - assert.Equal(t, 5, cat.GetMaxCardinality()) + assert.Equal(t, 5, cat.MaxCardinality()) } func Test_GetCALabels(t *testing.T) { @@ -97,7 +97,7 @@ func Test_GetCALabels(t *testing.T) { cat := newTestManager().TrackerForUser("user1") // Verify the CA labels - assert.Equal(t, []string{"team"}, cat.GetCALabels()) + assert.Equal(t, []string{"team"}, cat.CALabels()) } func Test_UpdateMaxCardinality(t *testing.T) { @@ -108,5 +108,5 @@ func Test_UpdateMaxCardinality(t *testing.T) { cat.UpdateMaxCardinality(20) // Verify the max cardinality - assert.Equal(t, 20, cat.GetMaxCardinality()) + assert.Equal(t, 20, cat.MaxCardinality()) } diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 9ee8f4909d3..40e49bd911e 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -111,27 +111,9 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { return c.matchers.MatcherNames() } -// Function to compare two Tracker instances -func areTrackersEqual(t1, t2 *costattribution.Tracker) bool { - cal1 := t1.GetCALabels() - cal2 := t2.GetCALabels() - if len(cal1) != len(cal2) { - return false - } - for i := range cal1 { - if cal1[i] != cal2[i] { - return false - } - } - return true -} - func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { - if ctCfg.String() != c.CurrentConfig().String() { - return true - } - - return !areTrackersEqual(caCfg, c.CurrentCostAttributionTracker()) + currentCTC, currentCAT := c.CurrentConfig() + return ctCfg.String() != currentCTC.String() || !costattribution.CompareCALabels(caCfg.CALabels(), currentCAT.CALabels()) } func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { @@ -145,16 +127,10 @@ func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { c.lastConfigUpdate = now } -func (c *ActiveSeries) CurrentConfig() 
asmodel.CustomTrackersConfig { - c.configMutex.RLock() - defer c.configMutex.RUnlock() - return c.matchers.Config() -} - -func (c *ActiveSeries) CurrentCostAttributionTracker() *costattribution.Tracker { +func (c *ActiveSeries) CurrentConfig() (asmodel.CustomTrackersConfig, *costattribution.Tracker) { c.configMutex.RLock() defer c.configMutex.RUnlock() - return c.cat + return c.matchers.Config(), c.cat } // UpdateSeries updates series timestamp to 'now'. Function is called to make a copy of labels if entry doesn't exist yet. From da6b00b043e9422a8d4ae54fcb287d968dc8ab8a Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 7 Nov 2024 13:07:51 +0100 Subject: [PATCH 13/32] add logic for overflow --- pkg/costattribution/manager.go | 14 +++++ pkg/costattribution/manager_test.go | 15 ++++-- pkg/costattribution/tracker.go | 60 +++++----------------- pkg/ingester/activeseries/active_series.go | 3 +- 4 files changed, 41 insertions(+), 51 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 9bf878dbbe6..290eeab2a63 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -129,11 +129,25 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) { m.deleteUserTracer(userID) continue } + // get all inactive attributions for the user and clean up the tracker inactiveObs := m.purgeInactiveObservationsForUser(userID, deadline) for _, ob := range inactiveObs { m.trackersByUserID[userID].cleanupTrackerAttribution(ob.lvalues) } + + // if the tracker is no longer overflowed, and it is currently in overflow state, check the cooldown and create new tracker + cat := m.TrackerForUser(userID) + if cat != nil && cat.isOverflow { + if len(cat.observed) < cat.MaxCardinality() { + if cat.cooldownUntil.Load() < deadline { + m.deleteUserTracer(userID) + continue + } + } else { + cat.cooldownUntil.Store(deadline + cat.cooldownDuration) + } + } } } diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 78a3f3fcf3a..6f82cf10ee8 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -111,6 +111,12 @@ func Test_CreateDeleteTracker(t *testing.T) { manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "dodo"}...), 1, time.Unix(20, 0)) manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "bar"}...), 1, time.Unix(30, 0)) manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "far"}...), 1, time.Unix(30, 0)) + + cat := manager.TrackerForUser("user3") + assert.True(t, cat.isOverflow) + // this number is the timestamp of when overflow was set + 20 minutes in seconds. 20 * 60 + 30 = 1230 + assert.Equal(t, int64(1230), cat.cooldownUntil.Load()) + expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter @@ -157,6 +163,7 @@ func Test_CreateDeleteTracker(t *testing.T) { manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) assert.Equal(t, 1, len(manager.trackersByUserID)) + expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
# TYPE cortex_received_attributed_samples_total counter @@ -171,13 +178,13 @@ func Test_CreateDeleteTracker(t *testing.T) { assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) }) - t.Run("Increase user cost attribution max cardinality, since current state is overflow, we clean up the counter", func(t *testing.T) { + t.Run("Increase user cost attribution max cardinality, since current state is overflow and cooldown is passed, we recreate tracker", func(t *testing.T) { // user3 has cost attribution labels department and service, we change it to team and feature. user1 should not be affected manager.limits, err = getMockLimits(3) assert.NoError(t, err) - manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, "invalid-metrics-name", time.Unix(12, 0)) - manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) - assert.Equal(t, 2, len(manager.trackersByUserID)) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, "invalid-metrics-name", time.Unix(1251, 0)) + manager.purgeInactiveAttributionsUntil(1250) + assert.Equal(t, 1, len(manager.trackersByUserID)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 8446971a384..854ae165fd1 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -67,8 +67,10 @@ type Tracker struct { obseveredMtx sync.RWMutex observed map[uint64]*Observation - hashBuffer []byte - overflowHash uint64 + hashBuffer []byte + isOverflow bool + cooldownUntil *atomic.Int64 + cooldownDuration int64 } func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { @@ -129,12 +131,16 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { if t == nil { return false } + // if the tracker is already in overflow state, we don't need to check the cardinality + if t.isOverflow { + return true + } // If the maximum cardinality is hit all streams become `__overflow__`, the function would return true. // the origin labels ovserved time is not updated, but the overflow hash is updated. 
- isOverflow := false if len(t.observed) > t.maxCardinality { - isOverflow = true - stream = t.overflowHash + t.isOverflow = true + t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration) + return true } if o, known := t.observed[stream]; known && o.lastUpdate != nil && o.lastUpdate.Load() < ts { @@ -146,7 +152,7 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { } } - return isOverflow + return false } // we need the time stamp, since active series could have entered active stripe long time ago, and already evicted @@ -188,24 +194,12 @@ func newTracker(userID string, trackedLabels []string, limit int) (*Tracker, err Help: "The total number of active series per user and attribution.", ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, }, append(trackedLabels, TenantLabel)), - hashBuffer: make([]byte, 0, 1024), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: int64((time.Minute * 20).Seconds()), } - m.updateOverFlowHash() return m, nil } -func (t *Tracker) updateOverFlowHash() { - if t == nil { - return - } - b := labels.NewScratchBuilder(len(t.caLabels)) - for _, lb := range t.caLabels { - b.Add(lb, overflowValue) - } - b.Sort() - t.overflowHash = b.Labels().Hash() -} - func (t *Tracker) Collect(out chan<- prometheus.Metric) { if t == nil { return @@ -223,35 +217,10 @@ func (t *Tracker) Describe(chan<- *prometheus.Desc) { } } -// resetObservedIfNeeded checks if the overflow hash is in the observed map and if it is, when dealine is 0, means that -// we just need to clean up the observed map and metrics without checking the deadline. -// Otherwise, we need to check if the last update time of the overflow hash is less than or equal to the deadline. -// return true if the observed map is cleaned up, otherwise false. -func (t *Tracker) resetObservedIfNeeded(deadline int64) bool { - if t == nil { - return false - } - t.obseveredMtx.Lock() - defer t.obseveredMtx.Unlock() - if ob, ok := t.observed[t.overflowHash]; ok { - if deadline == 0 || (ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline) { - t.observed = map[uint64]*Observation{} - t.cleanupTracker(t.userID) - return true - } - } - return false -} - func (t *Tracker) PurgeInactiveObservations(deadline int64) []*Observation { if t == nil { return nil } - // if overflow is in the observed map and it is reached dealine, we need to clean up the observed map and metrics - isReset := t.resetObservedIfNeeded(deadline) - if isReset { - return []*Observation{} - } // otherwise, we need to check all observations and clean up the ones that are inactive var invalidKeys []uint64 @@ -297,6 +266,5 @@ func (t *Tracker) UpdateMaxCardinality(limit int) { } // if we have hit the limit, we need to clear the observed map. The way to tell that we have hit the limit is // by checking if the overflow hash is in the observed map. This is handled in the resetObservedIfNeeded function. 0 here means no deadline check is needed. 
- t.resetObservedIfNeeded(0) t.maxCardinality = limit } diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 40e49bd911e..d8763053647 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -113,7 +113,8 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { currentCTC, currentCAT := c.CurrentConfig() - return ctCfg.String() != currentCTC.String() || !costattribution.CompareCALabels(caCfg.CALabels(), currentCAT.CALabels()) + // TODO: I think here to check the pointer is not equal is already enough, if we recreate tracker, it is for a good reason, otherwise, nothing changed + return ctCfg.String() != currentCTC.String() || caCfg != currentCAT //|| !costattribution.CompareCALabels(caCfg.CALabels(), currentCAT.CALabels()) } func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { From 2b5e3ff74bb1950aadd44c59e11f4ad1cffe9e9c Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 8 Nov 2024 00:23:13 +0100 Subject: [PATCH 14/32] improve tests for cost attribution service --- pkg/costattribution/manager_test.go | 199 ++++++++++++---------------- pkg/costattribution/tracker.go | 181 ++++++++++++------------- pkg/costattribution/tracker_test.go | 148 +++++++++++++-------- 3 files changed, 264 insertions(+), 264 deletions(-) diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 6f82cf10ee8..76e32ac89ba 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -18,23 +18,13 @@ import ( ) func getMockLimits(idx int) (*validation.Overrides, error) { - // Define base limits baseLimits := map[string]*validation.Limits{ - "user1": { - MaxCostAttributionCardinalityPerUser: 5, - CostAttributionLabels: []string{"team"}, - }, - "user2": { - MaxCostAttributionCardinalityPerUser: 2, - CostAttributionLabels: []string{}, - }, - "user3": { - MaxCostAttributionCardinalityPerUser: 2, - CostAttributionLabels: []string{"department", "service"}, - }, + "user1": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"team"}}, + "user2": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{}}, + "user3": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{"department", "service"}}, + "user4": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"platform"}}, } - // Adjust specific cases as needed switch idx { case 1: baseLimits["user1"].CostAttributionLabels = []string{} @@ -54,177 +44,154 @@ func getMockLimits(idx int) (*validation.Overrides, error) { func newTestManager() *Manager { logger := log.NewNopLogger() limits, _ := getMockLimits(0) - inactiveTimeout := 10 * time.Second - cleanupInterval := 5 * time.Second - return NewManager(cleanupInterval, inactiveTimeout, logger, limits) + return NewManager(5*time.Second, 10*time.Second, logger, limits) } func Test_NewManager(t *testing.T) { manager := newTestManager() - assert.NotNil(t, manager, "Expected manager to be initialized") - assert.NotNil(t, manager.trackersByUserID, "Expected attribution tracker to be initialized") - assert.Equal(t, 10*time.Second, manager.inactiveTimeout, "Expected inactiveTimeout to be initialized") + assert.NotNil(t, manager) + assert.NotNil(t, manager.trackersByUserID) + assert.Equal(t, 10*time.Second, manager.inactiveTimeout) } func 
Test_EnabledForUser(t *testing.T) { manager := newTestManager() assert.True(t, manager.EnabledForUser("user1"), "Expected cost attribution to be enabled for user1") assert.False(t, manager.EnabledForUser("user2"), "Expected cost attribution to be disabled for user2") - assert.False(t, manager.EnabledForUser("user4"), "Expected cost attribution to be disabled for user4") + assert.False(t, manager.EnabledForUser("user5"), "Expected cost attribution to be disabled for user5") } func Test_CreateDeleteTracker(t *testing.T) { - // Create a new manager and register it with prometheus registry manager := newTestManager() reg := prometheus.NewRegistry() - err := reg.Register(manager) - require.NoError(t, err) + require.NoError(t, reg.Register(manager)) - t.Run("Get tracker for user", func(t *testing.T) { - assert.NotNil(t, manager.TrackerForUser("user1").CALabels()) - assert.Equal(t, []string{"team"}, manager.TrackerForUser("user1").CALabels()) - assert.Equal(t, 5, manager.TrackerForUser("user1").MaxCardinality()) + t.Run("Tracker existence and attributes", func(t *testing.T) { + user1Tracker := manager.TrackerForUser("user1") + assert.NotNil(t, user1Tracker) + assert.Equal(t, []string{"team"}, user1Tracker.CALabels()) + assert.Equal(t, 5, user1Tracker.MaxCardinality()) - // user2 is not enabled for cost attribution, so tracker would be nil - tr2 := manager.TrackerForUser("user2") - assert.Nil(t, tr2) - assert.Equal(t, []string(nil), tr2.CALabels()) + assert.Nil(t, manager.TrackerForUser("user2")) - assert.Equal(t, []string{"department", "service"}, manager.TrackerForUser("user3").CALabels()) - assert.Equal(t, 2, manager.TrackerForUser("user3").MaxCardinality()) - - // user4 tenant config doesn't exist, so tracker would be nil - tr4 := manager.TrackerForUser("user4") - assert.Nil(t, tr4) - assert.Equal(t, []string(nil), tr4.CALabels()) - - assert.Equal(t, 2, len(manager.trackersByUserID)) + user3Tracker := manager.TrackerForUser("user3") + assert.NotNil(t, user3Tracker) + assert.Equal(t, []string{"department", "service"}, user3Tracker.CALabels()) + assert.Equal(t, 2, user3Tracker.MaxCardinality()) }) - t.Run("Track metrics for enabled user", func(t *testing.T) { - // since user2 is not enabled for cost attribution, tracker would be nil, no metrics would be tracked - manager.TrackerForUser("user2").IncrementReceivedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, time.Unix(0, 0)) - - // user1 and user3 is enabled for cost attribution, so metrics would be tracked - manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, "invalid-metrics-name", time.Unix(12, 0)) - manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings([]string{"department", "foo"}...), 1, "out-of-window", time.Unix(0, 0)) - manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "dodo"}...), 1, time.Unix(20, 0)) - manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "bar"}...), 1, time.Unix(30, 0)) - manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings([]string{"department", "foo", "service", "far"}...), 1, time.Unix(30, 0)) - - cat := manager.TrackerForUser("user3") - assert.True(t, cat.isOverflow) - // this number is the timestamp of when overflow was set + 20 minutes in seconds. 
20 * 60 + 30 = 1230 - assert.Equal(t, int64(1230), cat.cooldownUntil.Load()) + t.Run("Metrics tracking", func(t *testing.T) { + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("department", "foo", "service", "dodo"), 1, time.Unix(20, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 - cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="__missing__",tenant="user3",tracker="custom_attribution"} 1 # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",tenant="user3",tracker="custom_attribution"} 1 - cortex_received_attributed_samples_total{department="foo",service="bar",tenant="user3",tracker="custom_attribution"} 1 cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="custom_attribution"} 1 ` - metricNames := []string{ - "cortex_discarded_attributed_samples_total", - "cortex_received_attributed_samples_total", - } - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) }) t.Run("Purge inactive attributions", func(t *testing.T) { - // Purge inactive attributions until time 10, metrics cortex_discarded_attributed_samples_total of user3 should be deleted manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) - assert.Equal(t, 2, len(manager.trackersByUserID)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. - # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
- # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",tenant="user3",tracker="custom_attribution"} 1 - cortex_received_attributed_samples_total{department="foo",service="bar",tenant="user3",tracker="custom_attribution"} 1 - cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="custom_attribution"} 1 + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 ` - metricNames := []string{ - "cortex_discarded_attributed_samples_total", - "cortex_received_attributed_samples_total", - } - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) }) - t.Run("Disable user cost attribution, tracker and metrics are removed", func(t *testing.T) { - // We disable cost attribution for user1, so the tracker should be deleted - manager.limits, err = getMockLimits(1) - assert.NoError(t, err) - + t.Run("Disabling user cost attribution", func(t *testing.T) { + manager.limits, _ = getMockLimits(1) manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) assert.Equal(t, 1, len(manager.trackersByUserID)) expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{department="__overflow__",service="__overflow__",tenant="user3",tracker="custom_attribution"} 1 - cortex_received_attributed_samples_total{department="foo",service="bar",tenant="user3",tracker="custom_attribution"} 1 cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="custom_attribution"} 1 ` - metricNames := []string{ - "cortex_discarded_attributed_samples_total", - "cortex_received_attributed_samples_total", - } - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) }) - t.Run("Increase user cost attribution max cardinality, since current state is overflow and cooldown is passed, we recreate tracker", func(t *testing.T) { - // user3 has cost attribution labels department and service, we change it to team and feature. 
user1 should not be affected - manager.limits, err = getMockLimits(3) - assert.NoError(t, err) - manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, "invalid-metrics-name", time.Unix(1251, 0)) - manager.purgeInactiveAttributionsUntil(1250) + t.Run("Updating user cardinality and labels", func(t *testing.T) { + manager.limits, _ = getMockLimits(2) + manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix()) + // user3 tracker should be recreated with cost attribution labels changed to ["team", "feature"] assert.Equal(t, 1, len(manager.trackersByUserID)) + assert.Equal(t, []string{"feature", "team"}, manager.TrackerForUser("user3").CALabels()) + + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{feature="__missing__",reason="invalid-metrics-name",team="foo",tenant="user3",tracker="custom_attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) +} + +func Test_PurgeInactiveAttributionsUntil(t *testing.T) { + manager := newTestManager() + reg := prometheus.NewRegistry() + require.NoError(t, reg.Register(manager)) + + // Simulate metrics for multiple users to set up initial state + manager.TrackerForUser("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(1, 0)) + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) + + t.Run("Purge before inactive timeout", func(t *testing.T) { + // Run purge at a timestamp that doesn't exceed inactive timeout + manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix()) + + // No purging should have occurred, track user metrics remain + assert.Equal(t, 2, len(manager.trackersByUserID), "Expected trackers to remain active before timeout") expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
- # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="custom_attribution"} 1 ` metricNames := []string{ "cortex_discarded_attributed_samples_total", - "cortex_received_attributed_samples_total", } - assert.Equal(t, 3, manager.TrackerForUser("user3").MaxCardinality()) assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) }) - t.Run("Increase user cost attribution max cardinality, user is not in overflow, nothing changed", func(t *testing.T) { - // user3 has cost attribution labels department and service, we change it to team and feature - manager.limits, err = getMockLimits(4) - assert.NoError(t, err) - manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings([]string{"team", "foo"}...), 1, "invalid-metrics-name", time.Unix(13, 0)) - manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) - assert.Equal(t, 2, len(manager.trackersByUserID)) + t.Run("Purge after inactive timeout", func(t *testing.T) { + // disable cost attribution for user1 to test purging + manager.limits, _ = getMockLimits(1) + manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix()) + + // User3's tracker should remain since it's active, user1's tracker should be removed + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") + assert.Nil(t, manager.TrackerForUser("user1"), "Expected user1 tracker to be purged") + expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
- # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 2 + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="custom_attribution"} 1 ` metricNames := []string{ "cortex_discarded_attributed_samples_total", - "cortex_received_attributed_samples_total", } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) }) - t.Run("Change user cost attribution lables, tracker and metrics are reinitialized", func(t *testing.T) { - // user3 has cost attribution labels department and service, we change it to team and feature - manager.limits, err = getMockLimits(5) - assert.NoError(t, err) + t.Run("Purge all trackers", func(t *testing.T) { + // Trigger a purge that should remove all inactive trackers + manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix()) - manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) - assert.Equal(t, 2, len(manager.trackersByUserID)) + // Tracker would stay at 1 since user1's tracker is disabled + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") + + // No metrics should remain after all purged metricNames := []string{ "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total", diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 854ae165fd1..6898b7c6ee2 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -22,6 +22,59 @@ const ( TenantLabel = "tenant" ) +type Tracker struct { + userID string + caLabels []string + maxCardinality int + activeSeriesPerUserAttribution *prometheus.GaugeVec + receivedSamplesAttribution *prometheus.CounterVec + discardedSampleAttribution *prometheus.CounterVec + + // obseveredMtx protects the observed map + obseveredMtx sync.RWMutex + observed map[uint64]*Observation + + hashBuffer []byte + isOverflow bool + cooldownUntil *atomic.Int64 + cooldownDuration int64 +} + +func newTracker(userID string, trackedLabels []string, limit int) (*Tracker, error) { + // keep tracked labels sorted for consistent metric labels + sort.Slice(trackedLabels, func(i, j int) bool { + return trackedLabels[i] < trackedLabels[j] + }) + m := &Tracker{ + userID: userID, + caLabels: trackedLabels, + maxCardinality: limit, + obseveredMtx: sync.RWMutex{}, + observed: map[uint64]*Observation{}, + //lint:ignore faillint the metrics are registered in the mimir package + discardedSampleAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_discarded_attributed_samples_total", + Help: "The total number of samples that were discarded per attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, + }, append(trackedLabels, TenantLabel, "reason")), + //lint:ignore faillint the metrics are registered in the mimir package + receivedSamplesAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_received_attributed_samples_total", + Help: "The total number of samples that were received per attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, + }, append(trackedLabels, TenantLabel)), + //lint:ignore faillint the metrics are registered in the mimir package + activeSeriesPerUserAttribution: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + 
Name: "cortex_ingester_attributed_active_series", + Help: "The total number of active series per user and attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, + }, append(trackedLabels, TenantLabel)), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: int64((time.Minute * 20).Seconds()), + } + return m, nil +} + func (t *Tracker) CALabels() []string { if t == nil { return nil @@ -55,24 +108,6 @@ func (t *Tracker) cleanupTracker(userID string) { t.discardedSampleAttribution.DeletePartialMatch(filter) } -type Tracker struct { - userID string - caLabels []string - maxCardinality int - activeSeriesPerUserAttribution *prometheus.GaugeVec - receivedSamplesAttribution *prometheus.CounterVec - discardedSampleAttribution *prometheus.CounterVec - - // obseveredMtx protects the observed map - obseveredMtx sync.RWMutex - observed map[uint64]*Observation - - hashBuffer []byte - isOverflow bool - cooldownUntil *atomic.Int64 - cooldownDuration int64 -} - func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { if t == nil { return @@ -81,6 +116,14 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() } +func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { + if t == nil { + return + } + vals := t.getKeyValues(lbs, now.Unix(), nil) + t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() +} + func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { if t == nil { return @@ -97,6 +140,23 @@ func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now t.receivedSamplesAttribution.WithLabelValues(vals...).Add(value) } +func (t *Tracker) Collect(out chan<- prometheus.Metric) { + if t == nil { + return + } + t.activeSeriesPerUserAttribution.Collect(out) + t.receivedSamplesAttribution.Collect(out) + t.discardedSampleAttribution.Collect(out) +} + +// Describe implements prometheus.Collector. +func (t *Tracker) Describe(chan<- *prometheus.Desc) { + // this is an unchecked collector + if t == nil { + return + } +} + func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64, reason *string) []string { if t == nil { return nil @@ -131,90 +191,26 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { if t == nil { return false } - // if the tracker is already in overflow state, we don't need to check the cardinality - if t.isOverflow { - return true - } + // If the maximum cardinality is hit all streams become `__overflow__`, the function would return true. // the origin labels ovserved time is not updated, but the overflow hash is updated. 
if len(t.observed) > t.maxCardinality { t.isOverflow = true t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration) - return true } if o, known := t.observed[stream]; known && o.lastUpdate != nil && o.lastUpdate.Load() < ts { o.lastUpdate.Store(ts) } else { - t.observed[stream] = &Observation{ - lvalues: values, - lastUpdate: atomic.NewInt64(ts), + if !t.isOverflow { + t.observed[stream] = &Observation{ + lvalues: values, + lastUpdate: atomic.NewInt64(ts), + } } } - return false -} - -// we need the time stamp, since active series could have entered active stripe long time ago, and already evicted -// from the observed map but still in the active Stripe -func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, ts time.Time) { - if t == nil { - return - } - vals := t.getKeyValues(lbs, ts.Unix(), nil) - t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() -} - -func newTracker(userID string, trackedLabels []string, limit int) (*Tracker, error) { - // keep tracked labels sorted for consistent metric labels - sort.Slice(trackedLabels, func(i, j int) bool { - return trackedLabels[i] < trackedLabels[j] - }) - m := &Tracker{ - userID: userID, - caLabels: trackedLabels, - maxCardinality: limit, - obseveredMtx: sync.RWMutex{}, - observed: map[uint64]*Observation{}, - //lint:ignore faillint the metrics are registered in the mimir package - discardedSampleAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_discarded_attributed_samples_total", - Help: "The total number of samples that were discarded per attribution.", - ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, - }, append(trackedLabels, TenantLabel, "reason")), - //lint:ignore faillint the metrics are registered in the mimir package - receivedSamplesAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_received_attributed_samples_total", - Help: "The total number of samples that were received per attribution.", - ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, - }, append(trackedLabels, TenantLabel)), - //lint:ignore faillint the metrics are registered in the mimir package - activeSeriesPerUserAttribution: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_ingester_attributed_active_series", - Help: "The total number of active series per user and attribution.", - ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, - }, append(trackedLabels, TenantLabel)), - hashBuffer: make([]byte, 0, 1024), - cooldownDuration: int64((time.Minute * 20).Seconds()), - } - return m, nil -} - -func (t *Tracker) Collect(out chan<- prometheus.Metric) { - if t == nil { - return - } - t.activeSeriesPerUserAttribution.Collect(out) - t.receivedSamplesAttribution.Collect(out) - t.discardedSampleAttribution.Collect(out) -} - -// Describe implements prometheus.Collector. -func (t *Tracker) Describe(chan<- *prometheus.Desc) { - // this is an unchecked collector - if t == nil { - return - } + return t.isOverflow } func (t *Tracker) PurgeInactiveObservations(deadline int64) []*Observation { @@ -259,12 +255,5 @@ func (t *Tracker) UpdateMaxCardinality(limit int) { if t == nil { return } - // if we are reducing limit, we can just set it, if it hits the limit, we can't do much about it. - if t.maxCardinality >= limit { - t.maxCardinality = limit - return - } - // if we have hit the limit, we need to clear the observed map. The way to tell that we have hit the limit is - // by checking if the overflow hash is in the observed map. 
This is handled in the resetObservedIfNeeded function. 0 here means no deadline check is needed. t.maxCardinality = limit } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 14cab4f2f13..1b8f7c6995f 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -14,35 +14,47 @@ import ( "github.com/stretchr/testify/require" ) -func Test_NewTracker(t *testing.T) { +func Test_GetCALabels(t *testing.T) { + cat := newTestManager().TrackerForUser("user1") + assert.Equal(t, []string{"team"}, cat.CALabels(), "Expected cost attribution labels mismatch") +} - // Setup the test environment - reg := prometheus.NewRegistry() - trackedLabel := []string{"platform"} - cat, err := newTracker("user1", trackedLabel, 5) - require.NoError(t, err) +func Test_GetMaxCardinality(t *testing.T) { + cat := newTestManager().TrackerForUser("user1") + assert.Equal(t, 5, cat.MaxCardinality(), "Expected max cardinality mismatch") +} - // Register the metrics - err = reg.Register(cat) +func Test_CreateCleanupTracker(t *testing.T) { + // Setup the test environment for the user4, user4 has cost attribution labels "platform", max cardinality limit is 5 + cat := newTestManager().TrackerForUser("user4") + + reg := prometheus.NewRegistry() + err := reg.Register(cat) require.NoError(t, err) // Simulate some values in the metrics - vals := []string{"foo", "user1"} - cat.activeSeriesPerUserAttribution.WithLabelValues(vals...).Set(1.0) - cat.receivedSamplesAttribution.WithLabelValues(vals...).Add(5) - cat.discardedSampleAttribution.WithLabelValues(append(vals, "out-of-window")...).Add(2) + // platform="foo" tenant="user1" team="..." + cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) + cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(1, 0)) + cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3"), time.Unix(1, 0)) + cat.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(1, 0)) + cat.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(1, 0)) + + // platform="bar" tenant="user1" team="..." + cat.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(1, 0)) // Verify the metrics expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{platform="foo",reason="out-of-window", tenant="user1",tracker="custom_attribution"} 2 + cortex_discarded_attributed_samples_total{platform="foo",reason="sample-out-of-order", tenant="user4",tracker="custom_attribution"} 2 # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. # TYPE cortex_ingester_attributed_active_series gauge - cortex_ingester_attributed_active_series{platform="foo",tenant="user1",tracker="custom_attribution"} 1 + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="custom_attribution"} 1 + cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="custom_attribution"} 1 # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
# TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{platform="foo",tenant="user1",tracker="custom_attribution"} 5 + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="custom_attribution"} 5 ` metricNames := []string{ @@ -52,61 +64,93 @@ func Test_NewTracker(t *testing.T) { } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) - // Clean up the metrics - cat.cleanupTrackerAttribution(vals) + // Clean up the metrics with label values platform="foo" tenant="user1" + cat.cleanupTrackerAttribution([]string{"foo", "user4"}) + cat.cleanupTrackerAttribution([]string{"foo", "user4", "sample-out-of-order"}) + + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="custom_attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + + // Clean up the metrics with label values tenant="user1" + cat.cleanupTracker("user4") + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...)) +} + +func Test_GetKeyValues(t *testing.T) { + cat := newTestManager().TrackerForUser("user3") + + // Test initial key values and overflow states + keyVal1 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar"), 1, nil) + assert.Equal(t, []string{"foo", "bar", "user3"}, keyVal1, "First call, expecting values as-is") + + keyVal2 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "baz"), 2, nil) + assert.Equal(t, []string{"foo", "baz", "user3"}, keyVal2, "Second call, expecting values as-is") + + reason := "sample out of order" + keyVal3 := cat.getKeyValues(labels.FromStrings("department", "foo"), 3, &reason) + assert.Equal(t, []string{"foo", "__missing__", "user3", "sample out of order"}, keyVal3, "Service missing, should return '__missing__'") + + keyVal4 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar", "team", "a"), 4, nil) + assert.Equal(t, []string{"__overflow__", "__overflow__", "user3"}, keyVal4, "Overflow state expected") +} + +func Test_Overflow(t *testing.T) { + cat := newTestManager().TrackerForUser("user3") + lbls1 := labels.FromStrings("department", "foo", "service", "bar") + lbls2 := labels.FromStrings("department", "bar", "service", "baz") + lbls3 := labels.FromStrings("department", "baz", "service", "foo") + + var buf []byte + stream1, _ := lbls1.HashForLabels(buf, cat.caLabels...) + stream2, _ := lbls2.HashForLabels(buf, cat.caLabels...) + stream3, _ := lbls3.HashForLabels(buf, cat.caLabels...) 
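+	// user3's mock limits allow at most 2 attribution keys, so the first three distinct
+	// streams are still recorded as-is; the next call then finds the observed map above the
+	// limit, reports overflow, and sets cooldownUntil to its timestamp plus cooldownDuration.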
+ + assert.False(t, cat.overflow(stream1, []string{"foo", "bar", "user1"}, 1), "First observation, should not overflow") + assert.False(t, cat.overflow(stream2, []string{"bar", "baz", "user1"}, 2), "Second observation, should not overflow") + assert.False(t, cat.overflow(stream3, []string{"baz", "foo", "user1"}, 3), "Third observation, should not overflow") + assert.True(t, cat.overflow(stream3, []string{"baz", "foo", "user1"}, 4), "Fourth observation, should overflow") + assert.Equal(t, int64(4+cat.cooldownDuration), cat.cooldownUntil.Load(), "CooldownUntil should be updated correctly") } func Test_PurgeInactiveObservations(t *testing.T) { - // Setup the test environment, user1 cost attribution label is "team", max cardinality limit is 5 + // Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5. cat := newTestManager().TrackerForUser("user1") - // create 2 observations - lbs := []labels.Labels{ - labels.FromStrings([]string{"team", "foo"}...), - labels.FromStrings([]string{"team", "bar"}...), + // Create two observations with different last update timestamps. + observations := []labels.Labels{ + labels.FromStrings("team", "foo"), + labels.FromStrings("team", "bar"), } - cat.IncrementDiscardedSamples(lbs[0], 1, "invalid-metrics-name", time.Unix(1, 0)) - cat.IncrementDiscardedSamples(lbs[1], 2, "out-of-window-sample", time.Unix(12, 0)) + // Simulate samples discarded with different timestamps. + cat.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) + cat.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) - // Check the observations + // Ensure that two observations were successfully added to the tracker. require.Len(t, cat.observed, 2) - // Purge the observations older than 10 seconds, we should have 1 observation left + // Purge observations that haven't been updated in the last 10 seconds. purged := cat.PurgeInactiveObservations(10) - // Verify the purged observations + // Verify that only one observation was purged. require.Len(t, purged, 1) + + // Check that the purged observation matches the expected details. assert.Equal(t, int64(1), purged[0].lastUpdate.Load()) assert.Equal(t, []string{"foo", "user1", "invalid-metrics-name"}, purged[0].lvalues) - // Verify the remaining observations + // Verify that only one observation remains in the tracker. Confirm that the remaining observation has the correct last update timestamp. 
require.Len(t, cat.observed, 1) - assert.NotNil(t, cat.observed[lbs[1].Hash()].lastUpdate) -} - -func Test_GetMaxCardinality(t *testing.T) { - // Setup the test environment - cat := newTestManager().TrackerForUser("user1") - - // Verify the max cardinality - assert.Equal(t, 5, cat.MaxCardinality()) -} - -func Test_GetCALabels(t *testing.T) { - // Setup the test environment - cat := newTestManager().TrackerForUser("user1") - - // Verify the CA labels - assert.Equal(t, []string{"team"}, cat.CALabels()) + assert.NotNil(t, cat.observed[observations[1].Hash()].lastUpdate) + assert.Equal(t, int64(12), cat.observed[observations[1].Hash()].lastUpdate.Load()) } func Test_UpdateMaxCardinality(t *testing.T) { - // Setup the test environment + // user1 original max cardinality is 5 cat := newTestManager().TrackerForUser("user1") - - // Update max cardinality - cat.UpdateMaxCardinality(20) - - // Verify the max cardinality - assert.Equal(t, 20, cat.MaxCardinality()) + cat.UpdateMaxCardinality(2) + assert.Equal(t, 2, cat.MaxCardinality(), "Expected max cardinality update to 2") } From cb2a2b699f05838a9bdb63f2e23cc5ee385241d1 Mon Sep 17 00:00:00 2001 From: "Grot (@grafanabot)" <43478413+grafanabot@users.noreply.github.com> Date: Mon, 18 Nov 2024 04:17:02 +0200 Subject: [PATCH 15/32] Don't hold labels from store-gateways in two forms, and don't convert them multiple times (#9914) (#9930) * Don't hold labels from store-gateways in two forms * Don't retain labels longer than needed * Don't convert mimirpb.LabelAdaptors to labels.Labels multiple times * Add changelog entry (cherry picked from commit d2367de16a5623e422f802b1e89826051e3cc6f2) Co-authored-by: Charles Korn --- CHANGELOG.md | 1 + pkg/distributor/distributor.go | 2 +- pkg/distributor/query.go | 10 +++++---- pkg/querier/block_streaming.go | 13 ++++++----- pkg/querier/block_streaming_test.go | 5 +---- pkg/querier/blocks_store_queryable.go | 30 +++++++++++++++----------- pkg/util/limiter/query_limiter.go | 6 +++--- pkg/util/limiter/query_limiter_test.go | 17 +++++++-------- 8 files changed, 46 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a1389c5a2bf..c4adf5db7f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,7 @@ * [ENHANCEMENT] PromQL: make `sort_by_label` stable. #9879 * [ENHANCEMENT] Distributor: Initialize ha_tracker cache before ha_tracker and distributor reach running state and begin serving writes. #9826 * [ENHANCEMENT] Ingester: `-ingest-storage.kafka.max-buffered-bytes` to limit the memory for buffered records when using concurrent fetching. #9892 +* [ENHANCEMENT] Querier: improve performance and memory consumption of queries that select many series. #9914 * [BUGFIX] Fix issue where functions such as `rate()` over native histograms could return incorrect values if a float stale marker was present in the selected range. #9508 * [BUGFIX] Fix issue where negation of native histograms (eg. `-some_native_histogram_series`) did nothing. #9508 * [BUGFIX] Fix issue where `metric might not be a counter, name does not end in _total/_sum/_count/_bucket` annotation would be emitted even if `rate` or `increase` did not have enough samples to compute a result. 
#9508 diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 7ca90a4185e..fe475d5ed09 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -2557,7 +2557,7 @@ func (d *Distributor) MetricsForLabelMatchers(ctx context.Context, from, through result := make([]labels.Labels, 0, len(metrics)) for _, m := range metrics { - if err := queryLimiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(m)); err != nil { + if err := queryLimiter.AddSeries(m); err != nil { return nil, err } result = append(result, m) diff --git a/pkg/distributor/query.go b/pkg/distributor/query.go index 73610f6e089..ee381656840 100644 --- a/pkg/distributor/query.go +++ b/pkg/distributor/query.go @@ -268,7 +268,7 @@ func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSets [ if len(resp.Timeseries) > 0 { for _, series := range resp.Timeseries { - if limitErr := queryLimiter.AddSeries(series.Labels); limitErr != nil { + if limitErr := queryLimiter.AddSeries(mimirpb.FromLabelAdaptersToLabels(series.Labels)); limitErr != nil { return ingesterQueryResult{}, limitErr } } @@ -285,7 +285,7 @@ func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSets [ } for _, series := range resp.Chunkseries { - if err := queryLimiter.AddSeries(series.Labels); err != nil { + if err := queryLimiter.AddSeries(mimirpb.FromLabelAdaptersToLabels(series.Labels)); err != nil { return ingesterQueryResult{}, err } } @@ -300,7 +300,9 @@ func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSets [ streamingSeriesCount += len(resp.StreamingSeries) for _, s := range resp.StreamingSeries { - if err := queryLimiter.AddSeries(s.Labels); err != nil { + l := mimirpb.FromLabelAdaptersToLabels(s.Labels) + + if err := queryLimiter.AddSeries(l); err != nil { return ingesterQueryResult{}, err } @@ -313,7 +315,7 @@ func (d *Distributor) queryIngesterStream(ctx context.Context, replicationSets [ return ingesterQueryResult{}, err } - labelsBatch = append(labelsBatch, mimirpb.FromLabelAdaptersToLabels(s.Labels)) + labelsBatch = append(labelsBatch, l) } streamingSeriesBatches = append(streamingSeriesBatches, labelsBatch) diff --git a/pkg/querier/block_streaming.go b/pkg/querier/block_streaming.go index 927c5b2d449..3d21084f9ff 100644 --- a/pkg/querier/block_streaming.go +++ b/pkg/querier/block_streaming.go @@ -17,7 +17,6 @@ import ( "github.com/prometheus/prometheus/tsdb/chunkenc" "github.com/prometheus/prometheus/util/annotations" - "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" "github.com/grafana/mimir/pkg/storage/series" "github.com/grafana/mimir/pkg/storegateway/storegatewaypb" @@ -31,7 +30,7 @@ import ( // Implementation of storage.SeriesSet, based on individual responses from store client. type blockStreamingQuerierSeriesSet struct { - series []*storepb.StreamingSeries + series []labels.Labels streamReader chunkStreamReader // next response to process @@ -55,18 +54,22 @@ func (bqss *blockStreamingQuerierSeriesSet) Next() bool { return false } - currLabels := bqss.series[bqss.nextSeriesIndex].Labels + currLabels := bqss.series[bqss.nextSeriesIndex] seriesIdxStart := bqss.nextSeriesIndex // First series in this group. We might merge with more below. bqss.nextSeriesIndex++ // Chunks may come in multiple responses, but as soon as the response has chunks for a new series, // we can stop searching. Series are sorted. See documentation for StoreClient.Series call for details. 
// The actually merging of chunks happens in the Iterator() call where chunks are fetched. - for bqss.nextSeriesIndex < len(bqss.series) && mimirpb.CompareLabelAdapters(currLabels, bqss.series[bqss.nextSeriesIndex].Labels) == 0 { + for bqss.nextSeriesIndex < len(bqss.series) && labels.Equal(currLabels, bqss.series[bqss.nextSeriesIndex]) { bqss.nextSeriesIndex++ } - bqss.currSeries = newBlockStreamingQuerierSeries(mimirpb.FromLabelAdaptersToLabels(currLabels), seriesIdxStart, bqss.nextSeriesIndex-1, bqss.streamReader, bqss.chunkInfo, bqss.nextSeriesIndex >= len(bqss.series), bqss.remoteAddress) + bqss.currSeries = newBlockStreamingQuerierSeries(currLabels, seriesIdxStart, bqss.nextSeriesIndex-1, bqss.streamReader, bqss.chunkInfo, bqss.nextSeriesIndex >= len(bqss.series), bqss.remoteAddress) + + // Clear any labels we no longer need, to allow them to be garbage collected when they're no longer needed elsewhere. + clear(bqss.series[seriesIdxStart : bqss.nextSeriesIndex-1]) + return true } diff --git a/pkg/querier/block_streaming_test.go b/pkg/querier/block_streaming_test.go index 501fa8f7c43..48b62329aa9 100644 --- a/pkg/querier/block_streaming_test.go +++ b/pkg/querier/block_streaming_test.go @@ -19,7 +19,6 @@ import ( "go.uber.org/atomic" "google.golang.org/grpc/metadata" - "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" "github.com/grafana/mimir/pkg/storegateway/storepb" "github.com/grafana/mimir/pkg/util/limiter" @@ -166,9 +165,7 @@ func TestBlockStreamingQuerierSeriesSet(t *testing.T) { t.Run(name, func(t *testing.T) { ss := &blockStreamingQuerierSeriesSet{streamReader: &mockChunkStreamer{series: c.input, causeError: c.errorChunkStreamer}} for _, s := range c.input { - ss.series = append(ss.series, &storepb.StreamingSeries{ - Labels: mimirpb.FromLabelsToLabelAdapters(s.lbls), - }) + ss.series = append(ss.series, s.lbls) } idx := 0 var it chunkenc.Iterator diff --git a/pkg/querier/blocks_store_queryable.go b/pkg/querier/blocks_store_queryable.go index 9ce696b3553..a6f17182594 100644 --- a/pkg/querier/blocks_store_queryable.go +++ b/pkg/querier/blocks_store_queryable.go @@ -780,9 +780,9 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(ctx context.Context, sp *stor return err } - // A storegateway client will only fill either of mySeries or myStreamingSeries, and not both. + // A storegateway client will only fill either of mySeries or myStreamingSeriesLabels, and not both. 
mySeries := []*storepb.Series(nil) - myStreamingSeries := []*storepb.StreamingSeries(nil) + myStreamingSeriesLabels := []labels.Labels(nil) var myWarnings annotations.Annotations myQueriedBlocks := []ulid.ULID(nil) indexBytesFetched := uint64(0) @@ -813,7 +813,7 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(ctx context.Context, sp *stor mySeries = append(mySeries, s) // Add series fingerprint to query limiter; will return error if we are over the limit - if err := queryLimiter.AddSeries(s.Labels); err != nil { + if err := queryLimiter.AddSeries(mimirpb.FromLabelAdaptersToLabels(s.Labels)); err != nil { return err } @@ -853,16 +853,22 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(ctx context.Context, sp *stor } if ss := resp.GetStreamingSeries(); ss != nil { + myStreamingSeriesLabels = slices.Grow(myStreamingSeriesLabels, len(ss.Series)) + for _, s := range ss.Series { // Add series fingerprint to query limiter; will return error if we are over the limit - if limitErr := queryLimiter.AddSeries(s.Labels); limitErr != nil { + l := mimirpb.FromLabelAdaptersToLabels(s.Labels) + + if limitErr := queryLimiter.AddSeries(l); limitErr != nil { return limitErr } + + myStreamingSeriesLabels = append(myStreamingSeriesLabels, l) } - myStreamingSeries = append(myStreamingSeries, ss.Series...) + if ss.IsEndOfSeriesStream { // If we aren't expecting any series from this stream, close it now. - if len(myStreamingSeries) == 0 { + if len(myStreamingSeriesLabels) == 0 { util.CloseAndExhaust[*storepb.SeriesResponse](stream) //nolint:errcheck } @@ -904,13 +910,13 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(ctx context.Context, sp *stor chunkInfo.EndSeries(i == len(mySeries)-1) } } - } else if len(myStreamingSeries) > 0 { + } else if len(myStreamingSeriesLabels) > 0 { // FetchedChunks and FetchedChunkBytes are added by the SeriesChunksStreamReader. 
- reqStats.AddFetchedSeries(uint64(len(myStreamingSeries))) - streamReader = newStoreGatewayStreamReader(reqCtx, stream, len(myStreamingSeries), queryLimiter, reqStats, q.metrics, q.logger) + reqStats.AddFetchedSeries(uint64(len(myStreamingSeriesLabels))) + streamReader = newStoreGatewayStreamReader(reqCtx, stream, len(myStreamingSeriesLabels), queryLimiter, reqStats, q.metrics, q.logger) level.Debug(log).Log("msg", "received streaming series from store-gateway", "instance", c.RemoteAddress(), - "fetched series", len(myStreamingSeries), + "fetched series", len(myStreamingSeriesLabels), "fetched index bytes", indexBytesFetched, "requested blocks", strings.Join(convertULIDsToString(blockIDs), " "), "queried blocks", strings.Join(convertULIDsToString(myQueriedBlocks), " ")) @@ -925,12 +931,12 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(ctx context.Context, sp *stor mtx.Lock() if len(mySeries) > 0 { seriesSets = append(seriesSets, &blockQuerierSeriesSet{series: mySeries}) - } else if len(myStreamingSeries) > 0 { + } else if len(myStreamingSeriesLabels) > 0 { if chunkInfo != nil { chunkInfo.SetMsg("store-gateway streaming") } seriesSets = append(seriesSets, &blockStreamingQuerierSeriesSet{ - series: myStreamingSeries, + series: myStreamingSeriesLabels, streamReader: streamReader, chunkInfo: chunkInfo, remoteAddress: c.RemoteAddress(), diff --git a/pkg/util/limiter/query_limiter.go b/pkg/util/limiter/query_limiter.go index fcd61b88869..0a7333f7721 100644 --- a/pkg/util/limiter/query_limiter.go +++ b/pkg/util/limiter/query_limiter.go @@ -9,9 +9,9 @@ import ( "context" "sync" + "github.com/prometheus/prometheus/model/labels" "go.uber.org/atomic" - "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" "github.com/grafana/mimir/pkg/util/validation" ) @@ -74,12 +74,12 @@ func QueryLimiterFromContextWithFallback(ctx context.Context) *QueryLimiter { } // AddSeries adds the input series and returns an error if the limit is reached. 
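// Callers pass labels.Labels directly; the fingerprint below is computed with seriesLabels.Hash(),
// with no mimirpb.LabelAdapter conversion in the hot path.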
-func (ql *QueryLimiter) AddSeries(seriesLabels []mimirpb.LabelAdapter) validation.LimitError { +func (ql *QueryLimiter) AddSeries(seriesLabels labels.Labels) validation.LimitError { // If the max series is unlimited just return without managing map if ql.maxSeriesPerQuery == 0 { return nil } - fingerprint := mimirpb.FromLabelAdaptersToLabels(seriesLabels).Hash() + fingerprint := seriesLabels.Hash() ql.uniqueSeriesMx.Lock() defer ql.uniqueSeriesMx.Unlock() diff --git a/pkg/util/limiter/query_limiter_test.go b/pkg/util/limiter/query_limiter_test.go index 0d8041e4e4d..73179122241 100644 --- a/pkg/util/limiter/query_limiter_test.go +++ b/pkg/util/limiter/query_limiter_test.go @@ -16,7 +16,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" ) @@ -37,15 +36,15 @@ func TestQueryLimiter_AddSeries_ShouldReturnNoErrorOnLimitNotExceeded(t *testing reg = prometheus.NewPedanticRegistry() limiter = NewQueryLimiter(100, 0, 0, 0, stats.NewQueryMetrics(reg)) ) - err := limiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(series1)) + err := limiter.AddSeries(series1) assert.NoError(t, err) - err = limiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(series2)) + err = limiter.AddSeries(series2) assert.NoError(t, err) assert.Equal(t, 2, limiter.uniqueSeriesCount()) assertRejectedQueriesMetricValue(t, reg, 0, 0, 0, 0) // Re-add previous series to make sure it's not double counted - err = limiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(series1)) + err = limiter.AddSeries(series1) assert.NoError(t, err) assert.Equal(t, 2, limiter.uniqueSeriesCount()) assertRejectedQueriesMetricValue(t, reg, 0, 0, 0, 0) @@ -72,21 +71,21 @@ func TestQueryLimiter_AddSeries_ShouldReturnErrorOnLimitExceeded(t *testing.T) { reg = prometheus.NewPedanticRegistry() limiter = NewQueryLimiter(1, 0, 0, 0, stats.NewQueryMetrics(reg)) ) - err := limiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(series1)) + err := limiter.AddSeries(series1) require.NoError(t, err) assertRejectedQueriesMetricValue(t, reg, 0, 0, 0, 0) - err = limiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(series2)) + err = limiter.AddSeries(series2) require.Error(t, err) assertRejectedQueriesMetricValue(t, reg, 1, 0, 0, 0) // Add the same series again and ensure that we don't increment the failed queries metric again. - err = limiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(series2)) + err = limiter.AddSeries(series2) require.Error(t, err) assertRejectedQueriesMetricValue(t, reg, 1, 0, 0, 0) // Add another series and ensure that we don't increment the failed queries metric again. 
- err = limiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(series3)) + err = limiter.AddSeries(series3) require.Error(t, err) assertRejectedQueriesMetricValue(t, reg, 1, 0, 0, 0) } @@ -188,7 +187,7 @@ func BenchmarkQueryLimiter_AddSeries(b *testing.B) { reg := prometheus.NewPedanticRegistry() limiter := NewQueryLimiter(b.N+1, 0, 0, 0, stats.NewQueryMetrics(reg)) for _, s := range series { - err := limiter.AddSeries(mimirpb.FromLabelsToLabelAdapters(s)) + err := limiter.AddSeries(s) assert.NoError(b, err) } } From 078e689bded77b0fecce0fbe839044fb17c9c415 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 15:36:23 +0100 Subject: [PATCH 16/32] add per tenant cost attribution label limit --- pkg/util/validation/limits.go | 21 ++++++++++++++++++++- pkg/util/validation/limits_test.go | 6 ++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 5d85174ce4f..cb4c54f3efd 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -61,6 +61,8 @@ const ( QueryIngestersWithinFlag = "querier.query-ingesters-within" AlertmanagerMaxGrafanaConfigSizeFlag = "alertmanager.max-grafana-config-size-bytes" AlertmanagerMaxGrafanaStateSizeFlag = "alertmanager.max-grafana-state-size-bytes" + costAttributionLabelsFlag = "validation.cost-attribution-labels" + maxCostAttributionLabelsPerUserFlag = "validation.max-cost-attribution-labels-per-user" // MinCompactorPartialBlockDeletionDelay is the minimum partial blocks deletion delay that can be configured in Mimir. MinCompactorPartialBlockDeletionDelay = 4 * time.Hour @@ -69,6 +71,7 @@ const ( var ( errInvalidIngestStorageReadConsistency = fmt.Errorf("invalid ingest storage read consistency (supported values: %s)", strings.Join(api.ReadConsistencies, ", ")) errInvalidMaxEstimatedChunksPerQueryMultiplier = errors.New("invalid value for -" + MaxEstimatedChunksPerQueryMultiplierFlag + ": must be 0 or greater than or equal to 1") + errSurpassingCostAttributionLabelsLimit = errors.New("invalid value for -" + costAttributionLabelsFlag + ": should not surpassing the limit defined by -" + maxCostAttributionLabelsPerUserFlag) ) // LimitError is a marker interface for the errors that do not comply with the specified limits. @@ -185,7 +188,9 @@ type Limits struct { // Cost attribution and limit. CostAttributionLabels flagext.StringSliceCSV `yaml:"cost_attribution_labels" json:"cost_attribution_labels" category:"experimental"` + MaxCostAttributionLabelsPerUser int `yaml:"max_cost_attribution_labels_per_user" json:"max_cost_attribution_labels_per_user" category:"experimental"` MaxCostAttributionCardinalityPerUser int `yaml:"max_cost_attribution_cardinality_per_user" json:"max_cost_attribution_cardinality_per_user" category:"experimental"` + CostAttributionCooldown model.Duration `yaml:"cost_attribution_cooldown" json:"cost_attribution_cooldown" category:"experimental"` // Ruler defaults and limits. RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` @@ -294,8 +299,10 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. 
Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") - f.Var(&l.CostAttributionLabels, "validation.cost-attribution-labels", "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.") + f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.") + f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user. 0 to disable.") f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") + f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. This specifies how long the cost attribution tracker remains in overflow before attempting a reset. If the tracker is still in overflow after this period, the cooldown will be extended. Set to 0 to disable the cooldown period.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 
0 to disable") @@ -469,6 +476,10 @@ func (l *Limits) validate() error { return errInvalidIngestStorageReadConsistency } + if l.MaxCostAttributionLabelsPerUser != 0 && len(l.CostAttributionLabels) > l.MaxCostAttributionLabelsPerUser { + return errSurpassingCostAttributionLabelsLimit + } + return nil } @@ -789,6 +800,14 @@ func (o *Overrides) CostAttributionLabels(userID string) []string { return o.getOverridesForUser(userID).CostAttributionLabels } +func (o *Overrides) MaxCostAttributionLabelsPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionLabelsPerUser +} + +func (o *Overrides) CostAttributionCooldown(userID string) time.Duration { + return time.Duration(o.getOverridesForUser(userID).CostAttributionCooldown) +} + func (o *Overrides) MaxCostAttributionCardinalityPerUser(userID string) int { return o.getOverridesForUser(userID).MaxCostAttributionCardinalityPerUser } diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 6b5f9121923..fcad70a6c28 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -1076,6 +1076,12 @@ metric_relabel_configs: cfg: `ingest_storage_read_consistency: xyz`, expectedErr: errInvalidIngestStorageReadConsistency.Error(), }, + "should fail when cost_attribution_labels length exceeds max_cost_attribution_labels_per_user config": { + cfg: ` +cost_attribution_labels: label1, label2, label3, +max_cost_attribution_labels_per_user: 2`, + expectedErr: errSurpassingCostAttributionLabelsLimit.Error(), + }, } for testName, testData := range tests { From 4bf418a9a7c6151f37731315844553f983650af2 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 17:02:29 +0100 Subject: [PATCH 17/32] update doc --- cmd/mimir/config-descriptor.json | 22 +++++++++++++++++++ cmd/mimir/help-all.txt.tmpl | 4 ++++ .../configuration-parameters/index.md | 12 ++++++++++ 3 files changed, 38 insertions(+) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 1715445cd57..835ac2c2753 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4369,6 +4369,17 @@ "fieldType": "string", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "max_cost_attribution_labels_per_user", + "required": false, + "desc": "Maximum number of cost attribution labels allowed per user. 0 to disable.", + "fieldValue": null, + "fieldDefaultValue": 2, + "fieldFlag": "validation.max-cost-attribution-labels-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "max_cost_attribution_cardinality_per_user", @@ -4380,6 +4391,17 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "cost_attribution_cooldown", + "required": false, + "desc": "Cooldown period for cost attribution labels. This specifies how long the cost attribution tracker remains in overflow before attempting a reset. If the tracker is still in overflow after this period, the cooldown will be extended. 
Set to 0 to disable the cooldown period.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "validation.cost-attribution-cooldown", + "fieldType": "duration", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "ruler_evaluation_delay_duration", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 6daa0f7d3d2..4ef7b5629ef 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -3301,6 +3301,8 @@ Usage of ./cmd/mimir/mimir: Enable anonymous usage reporting. (default true) -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") + -validation.cost-attribution-cooldown duration + [experimental] Cooldown period for cost attribution labels. This specifies how long the cost attribution tracker remains in overflow before attempting a reset. If the tracker is still in overflow after this period, the cooldown will be extended. Set to 0 to disable the cooldown period. -validation.cost-attribution-labels comma-separated-list-of-strings [experimental] List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution. -validation.create-grace-period duration @@ -3309,6 +3311,8 @@ Usage of ./cmd/mimir/mimir: Enforce every metadata has a metric name. (default true) -validation.max-cost-attribution-cardinality-per-user int [experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000) + -validation.max-cost-attribution-labels-per-user int + [experimental] Maximum number of cost attribution labels allowed per user. 0 to disable. (default 2) -validation.max-label-names-per-series int Maximum number of label names per series. (default 30) -validation.max-length-label-name int diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 590b4d6892c..7d8c11a44df 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3560,11 +3560,23 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -validation.cost-attribution-labels [cost_attribution_labels: | default = ""] +# (experimental) Maximum number of cost attribution labels allowed per user. 0 +# to disable. +# CLI flag: -validation.max-cost-attribution-labels-per-user +[max_cost_attribution_labels_per_user: | default = 2] + # (experimental) Maximum cardinality of cost attribution labels allowed per # user. # CLI flag: -validation.max-cost-attribution-cardinality-per-user [max_cost_attribution_cardinality_per_user: | default = 10000] +# (experimental) Cooldown period for cost attribution labels. This specifies how +# long the cost attribution tracker remains in overflow before attempting a +# reset. If the tracker is still in overflow after this period, the cooldown +# will be extended. Set to 0 to disable the cooldown period. +# CLI flag: -validation.cost-attribution-cooldown +[cost_attribution_cooldown: | default = 0s] + # Duration to delay the evaluation of rules to ensure the underlying metrics # have been pushed. 
# CLI flag: -ruler.evaluation-delay-duration From 5e9e1c19f8421a735b7d62e6f0b0ddff555ac493 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 17:35:51 +0100 Subject: [PATCH 18/32] fix unittest --- pkg/costattribution/manager.go | 13 ++++++++----- pkg/costattribution/tracker.go | 20 +++++++++----------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 290eeab2a63..56f2337de87 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -74,7 +74,7 @@ func (m *Manager) TrackerForUser(userID string) *Tracker { // if not exists, create a new tracker if _, exists := m.trackersByUserID[userID]; !exists { - m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) + m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID)) } return m.trackersByUserID[userID] } @@ -183,13 +183,16 @@ func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64 // if they are different, we need to update the tracker, we don't mind, just reinitialized the tracker if !CompareCALabels(cat.CALabels(), newTrackedLabels) { m.mtx.Lock() - m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID)) + m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID)) // update the tracker with the new tracker cat = m.trackersByUserID[userID] m.mtx.Unlock() - } else if maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID); cat.MaxCardinality() != maxCardinality { - // if the maxCardinality is different, update the tracker - cat.UpdateMaxCardinality(maxCardinality) + } else { + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + // cooldown := m.limits.CostAttributionCooldown(userID) + if cat.MaxCardinality() != maxCardinality { + cat.UpdateMaxCardinality(maxCardinality) + } } return cat.PurgeInactiveObservations(deadline) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 6898b7c6ee2..37e60026098 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -40,7 +40,7 @@ type Tracker struct { cooldownDuration int64 } -func newTracker(userID string, trackedLabels []string, limit int) (*Tracker, error) { +func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration) (*Tracker, error) { // keep tracked labels sorted for consistent metric labels sort.Slice(trackedLabels, func(i, j int) bool { return trackedLabels[i] < trackedLabels[j] @@ -173,7 +173,10 @@ func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64, reason *string) []s values[len(values)-1] = *reason } var stream uint64 - stream, t.hashBuffer = lbls.HashForLabels(t.hashBuffer, t.caLabels...) + stream, _ = lbls.HashForLabels(t.hashBuffer, t.caLabels...) + if reason == nil { + values = values[:len(values)-1] + } if t.overflow(stream, values, ts) { // Omit last label. 
for i := range values[:len(values)-2] { @@ -181,9 +184,6 @@ func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64, reason *string) []s } } - if reason == nil { - return values[:len(values)-1] - } return values } @@ -201,12 +201,10 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { if o, known := t.observed[stream]; known && o.lastUpdate != nil && o.lastUpdate.Load() < ts { o.lastUpdate.Store(ts) - } else { - if !t.isOverflow { - t.observed[stream] = &Observation{ - lvalues: values, - lastUpdate: atomic.NewInt64(ts), - } + } else if len(t.observed) <= t.maxCardinality { + t.observed[stream] = &Observation{ + lvalues: values, + lastUpdate: atomic.NewInt64(ts), } } From bd3e112b028b7333ad2cd556b75886c3c2bc3eeb Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 17:53:48 +0100 Subject: [PATCH 19/32] fix ci --- pkg/costattribution/tracker.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 37e60026098..c3a59bf6fdd 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -70,7 +70,7 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, }, append(trackedLabels, TenantLabel)), hashBuffer: make([]byte, 0, 1024), - cooldownDuration: int64((time.Minute * 20).Seconds()), + cooldownDuration: int64(cooldown.Seconds()), } return m, nil } From cf16611a9322c8ee92cbf5911e550b37a2a88997 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 18:52:16 +0100 Subject: [PATCH 20/32] fix ci --- pkg/costattribution/tracker.go | 38 +++++++++++++++++------------ pkg/costattribution/tracker_test.go | 14 +++++------ 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index c3a59bf6fdd..486a79a01f6 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -95,7 +95,14 @@ func (t *Tracker) cleanupTrackerAttribution(vals []string) { } t.activeSeriesPerUserAttribution.DeleteLabelValues(vals...) t.receivedSamplesAttribution.DeleteLabelValues(vals...) - t.discardedSampleAttribution.DeleteLabelValues(vals...) 
+ + // except for discarded sample metrics, there is reason label that is not part of the key, we need to delete all partial matches + filter := prometheus.Labels{} + for i := 0; i < len(t.caLabels); i++ { + filter[t.caLabels[i]] = vals[i] + } + filter[TenantLabel] = vals[len(vals)-1] + t.discardedSampleAttribution.DeletePartialMatch(filter) } func (t *Tracker) cleanupTracker(userID string) { @@ -112,7 +119,7 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { if t == nil { return } - vals := t.getKeyValues(lbs, now.Unix(), nil) + vals := t.getKeyValues(lbs, now.Unix()) t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() } @@ -120,7 +127,7 @@ func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { if t == nil { return } - vals := t.getKeyValues(lbs, now.Unix(), nil) + vals := t.getKeyValues(lbs, now.Unix()) t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() } @@ -128,7 +135,12 @@ func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, re if t == nil { return } - vals := t.getKeyValues(lbs, now.Unix(), &reason) + vals := t.getKeyValues(lbs, now.Unix()) + if t.isOverflow { + vals = append(vals, overflowValue) + } else { + vals = append(vals, reason) + } t.discardedSampleAttribution.WithLabelValues(vals...).Add(value) } @@ -136,7 +148,7 @@ func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now if t == nil { return } - vals := t.getKeyValues(lbs, now.Unix(), nil) + vals := t.getKeyValues(lbs, now.Unix()) t.receivedSamplesAttribution.WithLabelValues(vals...).Add(value) } @@ -157,33 +169,27 @@ func (t *Tracker) Describe(chan<- *prometheus.Desc) { } } -func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64, reason *string) []string { +func (t *Tracker) getKeyValues(lbls labels.Labels, ts int64) []string { if t == nil { return nil } - values := make([]string, len(t.caLabels)+2) + values := make([]string, len(t.caLabels)+1) for i, l := range t.caLabels { values[i] = lbls.Get(l) if values[i] == "" { values[i] = missingValue } } - values[len(values)-2] = t.userID - if reason != nil { - values[len(values)-1] = *reason - } + values[len(values)-1] = t.userID var stream uint64 stream, _ = lbls.HashForLabels(t.hashBuffer, t.caLabels...) - if reason == nil { - values = values[:len(values)-1] - } + if t.overflow(stream, values, ts) { // Omit last label. - for i := range values[:len(values)-2] { + for i := range values[:len(values)-1] { values[i] = overflowValue } } - return values } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 1b8f7c6995f..c71e5569cdf 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -66,7 +66,6 @@ func Test_CreateCleanupTracker(t *testing.T) { // Clean up the metrics with label values platform="foo" tenant="user1" cat.cleanupTrackerAttribution([]string{"foo", "user4"}) - cat.cleanupTrackerAttribution([]string{"foo", "user4", "sample-out-of-order"}) expectedMetrics = ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
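The cleanupTrackerAttribution change above uses DeletePartialMatch because the discarded-samples metric carries a reason label that is not part of the tracker key; an exact DeleteLabelValues call would leave series with other reasons behind. A minimal sketch of that behaviour, with illustrative metric and label names:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	discarded := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "discarded_attributed_samples_total",
		Help: "Example counter keyed by team, tenant and reason.",
	}, []string{"team", "tenant", "reason"})

	// Two series that share the same attribution key but differ in reason.
	discarded.WithLabelValues("foo", "user4", "sample_out_of_order").Inc()
	discarded.WithLabelValues("foo", "user4", "sample_too_old").Inc()

	// Deleting by the attribution key alone removes both series, regardless of
	// the reason value.
	deleted := discarded.DeletePartialMatch(prometheus.Labels{"team": "foo", "tenant": "user4"})
	fmt.Println(deleted) // 2
}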
@@ -84,17 +83,16 @@ func Test_GetKeyValues(t *testing.T) { cat := newTestManager().TrackerForUser("user3") // Test initial key values and overflow states - keyVal1 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar"), 1, nil) + keyVal1 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar"), 1) assert.Equal(t, []string{"foo", "bar", "user3"}, keyVal1, "First call, expecting values as-is") - keyVal2 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "baz"), 2, nil) + keyVal2 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "baz"), 2) assert.Equal(t, []string{"foo", "baz", "user3"}, keyVal2, "Second call, expecting values as-is") - reason := "sample out of order" - keyVal3 := cat.getKeyValues(labels.FromStrings("department", "foo"), 3, &reason) - assert.Equal(t, []string{"foo", "__missing__", "user3", "sample out of order"}, keyVal3, "Service missing, should return '__missing__'") + keyVal3 := cat.getKeyValues(labels.FromStrings("department", "foo"), 3) + assert.Equal(t, []string{"foo", "__missing__", "user3"}, keyVal3, "Service missing, should return '__missing__'") - keyVal4 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar", "team", "a"), 4, nil) + keyVal4 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar", "team", "a"), 4) assert.Equal(t, []string{"__overflow__", "__overflow__", "user3"}, keyVal4, "Overflow state expected") } @@ -140,7 +138,7 @@ func Test_PurgeInactiveObservations(t *testing.T) { // Check that the purged observation matches the expected details. assert.Equal(t, int64(1), purged[0].lastUpdate.Load()) - assert.Equal(t, []string{"foo", "user1", "invalid-metrics-name"}, purged[0].lvalues) + assert.Equal(t, []string{"foo", "user1"}, purged[0].lvalues) // Verify that only one observation remains in the tracker. Confirm that the remaining observation has the correct last update timestamp. 
require.Len(t, cat.observed, 1) From 3c1f886c0758476b5f4e2ff9a1750fe933823187 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 18:56:08 +0100 Subject: [PATCH 21/32] remove unrelated changes --- pkg/mimir/mimir.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 6b78c476eb5..9f4476f0d68 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -709,15 +709,14 @@ type Mimir struct { ServiceMap map[string]services.Service ModuleManager *modules.Manager - API *api.API - Server *server.Server - IngesterRing *ring.Ring - IngesterPartitionRingWatcher *ring.PartitionRingWatcher - IngesterPartitionInstanceRing *ring.PartitionInstanceRing - TenantLimits validation.TenantLimits - Overrides *validation.Overrides - ActiveGroupsCleanup *util.ActiveGroupsCleanupService - + API *api.API + Server *server.Server + IngesterRing *ring.Ring + IngesterPartitionRingWatcher *ring.PartitionRingWatcher + IngesterPartitionInstanceRing *ring.PartitionInstanceRing + TenantLimits validation.TenantLimits + Overrides *validation.Overrides + ActiveGroupsCleanup *util.ActiveGroupsCleanupService Distributor *distributor.Distributor Ingester *ingester.Ingester Flusher *flusher.Flusher From d0cb1f3908071c664f3431c1a75ecc5ba2b7edd9 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 20:38:28 +0100 Subject: [PATCH 22/32] update purge logics --- pkg/costattribution/manager.go | 14 ++++++------- pkg/costattribution/tracker.go | 31 +++++++++++++++++++++-------- pkg/costattribution/tracker_test.go | 16 +++++++-------- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 56f2337de87..9d79d03efb5 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -138,12 +138,9 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) { // if the tracker is no longer overflowed, and it is currently in overflow state, check the cooldown and create new tracker cat := m.TrackerForUser(userID) - if cat != nil && cat.isOverflow { - if len(cat.observed) < cat.MaxCardinality() { - if cat.cooldownUntil.Load() < deadline { - m.deleteUserTracer(userID) - continue - } + if cat != nil && cat.cooldownUntil != nil && cat.cooldownUntil.Load() < deadline { + if len(cat.observed) <= cat.MaxCardinality() { + m.deleteUserTracer(userID) } else { cat.cooldownUntil.Store(deadline + cat.cooldownDuration) } @@ -189,10 +186,13 @@ func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64 m.mtx.Unlock() } else { maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) - // cooldown := m.limits.CostAttributionCooldown(userID) + cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) if cat.MaxCardinality() != maxCardinality { cat.UpdateMaxCardinality(maxCardinality) } + if cooldown != cat.CooldownDuration() { + cat.UpdateCooldownDuration(cooldown) + } } return cat.PurgeInactiveObservations(deadline) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 486a79a01f6..a04ebefcbf2 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -89,6 +89,13 @@ func (t *Tracker) MaxCardinality() int { return t.maxCardinality } +func (t *Tracker) CooldownDuration() int64 { + if t == nil { + return 0 + } + return t.cooldownDuration +} + func (t *Tracker) cleanupTrackerAttribution(vals []string) { if t == nil { return @@ -198,22 +205,23 @@ func (t *Tracker) 
overflow(stream uint64, values []string, ts int64) bool { return false } - // If the maximum cardinality is hit all streams become `__overflow__`, the function would return true. - // the origin labels ovserved time is not updated, but the overflow hash is updated. - if len(t.observed) > t.maxCardinality { - t.isOverflow = true - t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration) - } - + // we store up to 2 * maxCardinality observations, if we have seen the stream before, we update the last update time if o, known := t.observed[stream]; known && o.lastUpdate != nil && o.lastUpdate.Load() < ts { o.lastUpdate.Store(ts) - } else if len(t.observed) <= t.maxCardinality { + } else if len(t.observed) < t.maxCardinality*2 { t.observed[stream] = &Observation{ lvalues: values, lastUpdate: atomic.NewInt64(ts), } } + // If the maximum cardinality is hit all streams become `__overflow__`, the function would return true. + // the origin labels ovserved time is not updated, but the overflow hash is updated. + if !t.isOverflow && len(t.observed) > t.maxCardinality { + t.isOverflow = true + t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration) + } + return t.isOverflow } @@ -261,3 +269,10 @@ func (t *Tracker) UpdateMaxCardinality(limit int) { } t.maxCardinality = limit } + +func (t *Tracker) UpdateCooldownDuration(cooldownDuration int64) { + if t == nil { + return + } + t.cooldownDuration = cooldownDuration +} diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index c71e5569cdf..d4420d12d72 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -86,13 +86,13 @@ func Test_GetKeyValues(t *testing.T) { keyVal1 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar"), 1) assert.Equal(t, []string{"foo", "bar", "user3"}, keyVal1, "First call, expecting values as-is") - keyVal2 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "baz"), 2) - assert.Equal(t, []string{"foo", "baz", "user3"}, keyVal2, "Second call, expecting values as-is") + keyVal2 := cat.getKeyValues(labels.FromStrings("department", "foo"), 3) + assert.Equal(t, []string{"foo", "__missing__", "user3"}, keyVal2, "Service missing, should return '__missing__'") - keyVal3 := cat.getKeyValues(labels.FromStrings("department", "foo"), 3) - assert.Equal(t, []string{"foo", "__missing__", "user3"}, keyVal3, "Service missing, should return '__missing__'") + keyVal3 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "baz", "team", "a"), 4) + assert.Equal(t, []string{"__overflow__", "__overflow__", "user3"}, keyVal3, "Overflow state expected") - keyVal4 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar", "team", "a"), 4) + keyVal4 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar"), 5) assert.Equal(t, []string{"__overflow__", "__overflow__", "user3"}, keyVal4, "Overflow state expected") } @@ -109,9 +109,9 @@ func Test_Overflow(t *testing.T) { assert.False(t, cat.overflow(stream1, []string{"foo", "bar", "user1"}, 1), "First observation, should not overflow") assert.False(t, cat.overflow(stream2, []string{"bar", "baz", "user1"}, 2), "Second observation, should not overflow") - assert.False(t, cat.overflow(stream3, []string{"baz", "foo", "user1"}, 3), "Third observation, should not overflow") - assert.True(t, cat.overflow(stream3, []string{"baz", "foo", "user1"}, 4), "Fourth observation, should overflow") - assert.Equal(t, int64(4+cat.cooldownDuration), 
cat.cooldownUntil.Load(), "CooldownUntil should be updated correctly") + assert.True(t, cat.overflow(stream3, []string{"baz", "foo", "user1"}, 3), "Third observation didn't seen before, should overflow") + assert.True(t, cat.overflow(stream3, []string{"baz", "foo", "user1"}, 4), "Fourth observation, should stay overflow") + assert.Equal(t, int64(3+cat.cooldownDuration), cat.cooldownUntil.Load(), "CooldownUntil should be updated correctly") } func Test_PurgeInactiveObservations(t *testing.T) { From 5d4a2c4b3d96e4695206d31db83bf41e081c8b5c Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 21:33:10 +0100 Subject: [PATCH 23/32] fix ci --- pkg/costattribution/tracker_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index d4420d12d72..c206c8799a7 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -142,8 +142,10 @@ func Test_PurgeInactiveObservations(t *testing.T) { // Verify that only one observation remains in the tracker. Confirm that the remaining observation has the correct last update timestamp. require.Len(t, cat.observed, 1) - assert.NotNil(t, cat.observed[observations[1].Hash()].lastUpdate) - assert.Equal(t, int64(12), cat.observed[observations[1].Hash()].lastUpdate.Load()) + ob := cat.observed[observations[1].Hash()] + assert.NotNil(t, ob) + assert.NotNil(t, ob.lastUpdate) + assert.Equal(t, int64(12), ob.lastUpdate.Load()) } func Test_UpdateMaxCardinality(t *testing.T) { From 6091493a68ce8c9b1991acca8a19b8f9632744ad Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 22:25:15 +0100 Subject: [PATCH 24/32] fix ci --- pkg/costattribution/tracker.go | 5 ++--- pkg/costattribution/tracker_test.go | 10 ++-------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index a04ebefcbf2..00caa31b4fd 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -232,6 +232,8 @@ func (t *Tracker) PurgeInactiveObservations(deadline int64) []*Observation { // otherwise, we need to check all observations and clean up the ones that are inactive var invalidKeys []uint64 + t.obseveredMtx.Lock() + defer t.obseveredMtx.Unlock() for labHash, ob := range t.observed { if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline { invalidKeys = append(invalidKeys, labHash) @@ -242,9 +244,6 @@ func (t *Tracker) PurgeInactiveObservations(deadline int64) []*Observation { return nil } - t.obseveredMtx.Lock() - defer t.obseveredMtx.Unlock() - // Cleanup inactive observations and return all invalid observations to clean up metrics for them res := make([]*Observation, len(invalidKeys)) for i := 0; i < len(invalidKeys); { diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index c206c8799a7..73409d5f606 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -131,7 +131,8 @@ func Test_PurgeInactiveObservations(t *testing.T) { require.Len(t, cat.observed, 2) // Purge observations that haven't been updated in the last 10 seconds. - purged := cat.PurgeInactiveObservations(10) + purged := cat.PurgeInactiveObservations(5) + require.Len(t, cat.observed, 1) // Verify that only one observation was purged. require.Len(t, purged, 1) @@ -139,13 +140,6 @@ func Test_PurgeInactiveObservations(t *testing.T) { // Check that the purged observation matches the expected details. 
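Test_Overflow above captures the intended state machine: the tracker admits new attribution keys until their count exceeds the per-tenant cardinality limit, then flips to overflow and schedules a reset for ts plus the cooldown. A minimal sketch of that sequence, written as if it lived inside pkg/costattribution (newTracker and overflow are unexported) and assuming two tracked labels, a cardinality limit of 2 and a 20-minute cooldown:

package costattribution

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
)

// TestOverflowSketch is illustrative only; it mirrors Test_Overflow with
// explicit stream hashes for three distinct attribution keys.
func TestOverflowSketch(t *testing.T) {
	cat, err := newTracker("user1", []string{"department", "service"}, 2, 20*time.Minute)
	assert.NoError(t, err)

	assert.False(t, cat.overflow(1, []string{"foo", "bar", "user1"}, 1), "first key: under the limit")
	assert.False(t, cat.overflow(2, []string{"bar", "baz", "user1"}, 2), "second key: at the limit")

	// A third distinct key pushes the observed count past maxCardinality, so
	// the tracker enters overflow and stays there until the cooldown passes.
	assert.True(t, cat.overflow(3, []string{"baz", "foo", "user1"}, 3))
	assert.Equal(t, int64(3+cat.cooldownDuration), cat.cooldownUntil.Load())
}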
assert.Equal(t, int64(1), purged[0].lastUpdate.Load()) assert.Equal(t, []string{"foo", "user1"}, purged[0].lvalues) - - // Verify that only one observation remains in the tracker. Confirm that the remaining observation has the correct last update timestamp. - require.Len(t, cat.observed, 1) - ob := cat.observed[observations[1].Hash()] - assert.NotNil(t, ob) - assert.NotNil(t, ob.lastUpdate) - assert.Equal(t, int64(12), ob.lastUpdate.Load()) } func Test_UpdateMaxCardinality(t *testing.T) { From 7324a1dce31343b89062a0ef82e7cd7d05f689d1 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 18 Nov 2024 23:36:35 +0100 Subject: [PATCH 25/32] update logic for overflow, purge other metrics than overflow --- pkg/costattribution/manager.go | 18 +++++++----------- pkg/costattribution/manager_test.go | 13 +++++++++++++ pkg/costattribution/tracker.go | 13 +++++++++++-- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 9d79d03efb5..e6ac57d3aac 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -102,12 +102,11 @@ func (m *Manager) deleteUserTracer(userID string) { } m.mtx.Lock() defer m.mtx.Unlock() - if _, exists := m.trackersByUserID[userID]; !exists { - return + if _, exists := m.trackersByUserID[userID]; exists { + // clean up tracker metrics and delete the tracker + m.trackersByUserID[userID].cleanupTracker(userID) + delete(m.trackersByUserID, userID) } - // clean up tracker metrics and delete the tracker - m.trackersByUserID[userID].cleanupTracker(userID) - delete(m.trackersByUserID, userID) } func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) { @@ -125,7 +124,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) { // Iterate over all userIDs and purge inactive attributions of each user for _, userID := range userIDs { // if cost attribution is not enabled for the user, delete the user tracker and continue - if len(m.limits.CostAttributionLabels(userID)) == 0 || m.limits.MaxCostAttributionCardinalityPerUser(userID) <= 0 { + if !m.EnabledForUser(userID) { m.deleteUserTracer(userID) continue } @@ -163,10 +162,6 @@ func CompareCALabels(a, b []string) bool { } func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64) []*Observation { - if m == nil { - return nil - } - cat := m.TrackerForUser(userID) if cat == nil { return nil @@ -186,10 +181,11 @@ func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64 m.mtx.Unlock() } else { maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) - cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) if cat.MaxCardinality() != maxCardinality { cat.UpdateMaxCardinality(maxCardinality) } + + cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) if cooldown != cat.CooldownDuration() { cat.UpdateCooldownDuration(cooldown) } diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 76e32ac89ba..03fe7b679a4 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -133,6 +133,19 @@ func Test_CreateDeleteTracker(t *testing.T) { ` assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) }) + + t.Run("When cost attribution get overflowed, all metrics are purged except overflow metrics", func(t *testing.T) { + // user3 has maximum cardinality of 2, so adding 3rd attribution should trigger overflow 
+ manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "bar", "feature", "bar"), 1, time.Unix(15, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "baz", "feature", "baz"), 1, time.Unix(16, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "foo", "feature", "foo"), 1, time.Unix(17, 0)) + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{feature="__overflow__",team="__overflow__",tenant="user3",tracker="custom_attribution"} 2 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) } func Test_PurgeInactiveAttributionsUntil(t *testing.T) { diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 00caa31b4fd..c28ce2badd4 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -127,7 +127,11 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { return } vals := t.getKeyValues(lbs, now.Unix()) - t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() + if t.isOverflow { + t.activeSeriesPerUserAttribution.WithLabelValues(overflowValue).Set(1) + } else { + t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() + } } func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { @@ -135,7 +139,11 @@ func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { return } vals := t.getKeyValues(lbs, now.Unix()) - t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() + if t.isOverflow { + t.activeSeriesPerUserAttribution.WithLabelValues(overflowValue).Set(1) + } else { + t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() + } } func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { @@ -219,6 +227,7 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { // the origin labels ovserved time is not updated, but the overflow hash is updated. 
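	// Illustration of the transition, assuming maxCardinality = 2: the first two
	// distinct keys are exported with their real label values; a third key makes
	// len(t.observed) exceed maxCardinality, so isOverflow flips, the previously
	// exported per-attribution series are cleaned up, and every subsequent update
	// is reported under the single __overflow__ key until the manager's purge
	// loop can rebuild the tracker after the cooldown.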
if !t.isOverflow && len(t.observed) > t.maxCardinality { t.isOverflow = true + t.cleanupTracker(t.userID) t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration) } From 5af48e4221227f837b3b72ec0a704db500e7f21e Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 19 Nov 2024 23:12:38 +0100 Subject: [PATCH 26/32] add distributor benchmark test for push --- pkg/distributor/distributor_test.go | 44 +++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index b155a1fcd04..cefe0be1618 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -54,6 +54,7 @@ import ( "google.golang.org/grpc/metadata" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -1843,11 +1844,11 @@ func BenchmarkDistributor_Push(b *testing.B) { numSeriesPerRequest = 1000 ) ctx := user.InjectOrgID(context.Background(), "user") - tests := map[string]struct { - prepareConfig func(limits *validation.Limits) - prepareSeries func() ([][]mimirpb.LabelAdapter, []mimirpb.Sample) - expectedErr string + prepareConfig func(limits *validation.Limits) + prepareSeries func() ([][]mimirpb.LabelAdapter, []mimirpb.Sample) + expectedErr string + customRegistry *prometheus.Registry }{ "all samples successfully pushed": { prepareConfig: func(*validation.Limits) {}, @@ -1856,7 +1857,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -1867,6 +1868,26 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "", }, + "all samples successfully pushed with cost attribution enabled": { + // we should have the cost attribution on team, attribution crosee team 0, 1, 2, 3 + prepareConfig: func(limits *validation.Limits) { + limits.CostAttributionLabels = []string{"team"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + prepareSeries: func() ([][]mimirpb.LabelAdapter, []mimirpb.Sample) { + metrics := make([][]mimirpb.LabelAdapter, numSeriesPerRequest) + samples := make([]mimirpb.Sample, numSeriesPerRequest) + for i := 0; i < numSeriesPerRequest; i++ { + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) + samples[i] = mimirpb.Sample{ + Value: float64(i), + TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), + } + } + return metrics, samples + }, + customRegistry: prometheus.NewRegistry(), + }, "ingestion rate limit reached": { prepareConfig: func(limits *validation.Limits) { limits.IngestionRate = 1 @@ -1877,7 +1898,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -1897,7 +1918,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(31) + metrics[i] = mkLabels(30, "team", strconv.Itoa(i%4)) samples[i] = 
mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2047,8 +2068,15 @@ func BenchmarkDistributor_Push(b *testing.B) { overrides, err := validation.NewOverrides(limits, nil) require.NoError(b, err) + var cam *costattribution.Manager + if testData.customRegistry != nil { + cam = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides) + err := testData.customRegistry.Register(cam) + require.NoError(b, err) + } + // Start the distributor. - distributor, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorCfg, clientConfig, overrides, nil, cam, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(b, err) require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) From 203689aac46717842c2eacf0ce0916695e85e685 Mon Sep 17 00:00:00 2001 From: "Grot (@grafanabot)" <43478413+grafanabot@users.noreply.github.com> Date: Wed, 20 Nov 2024 09:52:10 +0000 Subject: [PATCH 27/32] Improve logging at ha_tracker sync operation (#9958) (#9961) Signed-off-by: Nikos Angelopoulos (cherry picked from commit 0caede48ebb4818c2d6de3444986f3c739a7b0c2) Co-authored-by: Nikos Angelopoulos --- pkg/distributor/ha_tracker.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/distributor/ha_tracker.go b/pkg/distributor/ha_tracker.go index d663470d291..3f2bcfcf430 100644 --- a/pkg/distributor/ha_tracker.go +++ b/pkg/distributor/ha_tracker.go @@ -221,8 +221,10 @@ func (h *haTracker) syncHATrackerStateOnStart(ctx context.Context) error { return nil } + level.Info(h.logger).Log("msg", "sync HA state on start: Listing keys from KV Store") keys, err := h.client.List(ctx, "") if err != nil { + level.Error(h.logger).Log("msg", "sync HA state on start: failed to list the keys ", "err", err) return err } From a2f009b0baf7827979dbf83873682db848fca619 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 20 Nov 2024 16:55:41 +0100 Subject: [PATCH 28/32] add benchmark in ingester --- pkg/distributor/distributor_test.go | 23 +++++ pkg/ingester/ingester_test.go | 131 +++++++++++++++++++--------- 2 files changed, 113 insertions(+), 41 deletions(-) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index cefe0be1618..59008a60cef 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -1929,6 +1929,29 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "received a series whose number of labels exceeds the limit", }, + "too many labels limit reached with cost attribution enabled": { + prepareConfig: func(limits *validation.Limits) { + limits.MaxLabelNamesPerSeries = 30 + limits.CostAttributionLabels = []string{"team"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + prepareSeries: func() ([][]mimirpb.LabelAdapter, []mimirpb.Sample) { + metrics := make([][]mimirpb.LabelAdapter, numSeriesPerRequest) + samples := make([]mimirpb.Sample, numSeriesPerRequest) + + for i := 0; i < numSeriesPerRequest; i++ { + metrics[i] = mkLabels(30, "team", strconv.Itoa(i%4)) + samples[i] = mimirpb.Sample{ + Value: float64(i), + TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), + } + } + + return metrics, samples + }, + customRegistry: prometheus.NewRegistry(), + expectedErr: "received a series whose number of labels exceeds the limit", + }, "max label name length limit reached": { prepareConfig: func(limits *validation.Limits) { limits.MaxLabelNameLength = 
200 diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 8cb661910ac..714db5955ce 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -59,6 +59,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -3589,53 +3590,97 @@ func TestIngester_Push_DecreaseInactiveSeries(t *testing.T) { } func BenchmarkIngesterPush(b *testing.B) { - registry := prometheus.NewRegistry() - ctx := user.InjectOrgID(context.Background(), userID) + tests := []struct { + name string + limitsCfg func() validation.Limits + customRegistry prometheus.Registerer + }{ + { + name: "ingester push succeeded", + limitsCfg: func() validation.Limits { + limitsCfg := defaultLimitsTestConfig() + limitsCfg.NativeHistogramsIngestionEnabled = true + return limitsCfg + }, + }, + { + name: "ingester push succeeded with cost attribution enabled", + limitsCfg: func() validation.Limits { + limitsCfg := defaultLimitsTestConfig() + limitsCfg.NativeHistogramsIngestionEnabled = true + limitsCfg.CostAttributionLabels = []string{"cpu"} + limitsCfg.MaxCostAttributionCardinalityPerUser = 100 + return limitsCfg + }, + customRegistry: prometheus.NewRegistry(), + }, + } - // Create a mocked ingester - cfg := defaultIngesterTestConfig(b) + for _, t := range tests { + b.Run(t.name, func(b *testing.B) { + registry := prometheus.NewRegistry() + ctx := user.InjectOrgID(context.Background(), userID) - ingester, err := prepareIngesterWithBlocksStorage(b, cfg, nil, registry) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - defer services.StopAndAwaitTerminated(context.Background(), ingester) //nolint:errcheck + // Create a mocked ingester + cfg := defaultIngesterTestConfig(b) - // Wait until the ingester is healthy - test.Poll(b, 100*time.Millisecond, 1, func() interface{} { - return ingester.lifecycler.HealthyInstancesCount() - }) + limitCfg := t.limitsCfg() + overrides, err := validation.NewOverrides(limitCfg, nil) + require.NoError(b, err) - // Push a single time series to set the TSDB min time. - metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} - startTime := util.TimeToMillis(time.Now()) + var cam *costattribution.Manager + if t.customRegistry != nil { + cam = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides) + err = t.customRegistry.Register(cam) + require.NoError(b, err) + } - currTimeReq := mimirpb.ToWriteRequest( - metricLabelAdapters, - []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, - nil, - nil, - mimirpb.API, - ) - _, err = ingester.Push(ctx, currTimeReq) - require.NoError(b, err) + ingester, err := prepareIngesterWithBlockStorageOverridesAndCostAttribution(b, cfg, overrides, nil, "", "", registry, cam) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - const ( - series = 10 - samples = 1 - ) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingester)) + }) + + // Wait until the ingester is healthy + test.Poll(b, 100*time.Millisecond, 1, func() interface{} { + return ingester.lifecycler.HealthyInstancesCount() + }) - allLabels, allSamples := benchmarkData(series) + // Push a single time series to set the TSDB min time. 
+ metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} + startTime := util.TimeToMillis(time.Now()) - b.ResetTimer() - for iter := 0; iter < b.N; iter++ { - // Bump the timestamp on each of our test samples each time round the loop - for j := 0; j < samples; j++ { - for i := range allSamples { - allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) - } - _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) + currTimeReq := mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, + nil, + nil, + mimirpb.API, + ) + _, err = ingester.Push(ctx, currTimeReq) require.NoError(b, err) - } + + const ( + series = 10 + samples = 1 + ) + + allLabels, allSamples := benchmarkData(series) + + b.ResetTimer() + for iter := 0; iter < b.N; iter++ { + // Bump the timestamp on each of our test samples each time round the loop + for j := 0; j < samples; j++ { + for i := range allSamples { + allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + } + _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) + require.NoError(b, err) + } + } + }) } } @@ -6056,10 +6101,14 @@ func prepareIngesterWithBlocksStorageAndLimits(t testing.TB, ingesterCfg Config, } func prepareIngesterWithBlockStorageAndOverrides(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { - return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer) + return prepareIngesterWithBlockStorageOverridesAndCostAttribution(t, ingesterCfg, overrides, ingestersRing, dataDir, bucketDir, registerer, nil) +} + +func prepareIngesterWithBlockStorageOverridesAndCostAttribution(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { + return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer, cam) } -func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { +func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { // Create a data dir if none has been provided. 
if dataDir == "" { dataDir = t.TempDir() @@ -6080,7 +6129,7 @@ func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, i ingestersRing = createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()) } - ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) + ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, cam, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) if err != nil { return nil, err } From 00d209206e1cdc110d70ca38408cf13183c0bc35 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 22 Nov 2024 17:39:31 +0100 Subject: [PATCH 29/32] refactory benchmark tests --- pkg/distributor/distributor_test.go | 221 +++++++++++++--------------- pkg/ingester/ingester_test.go | 145 ++++++++++-------- 2 files changed, 183 insertions(+), 183 deletions(-) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 59008a60cef..2850bdef553 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -1824,7 +1824,7 @@ func mkLabels(n int, extra ...string) []mimirpb.LabelAdapter { ret[i+1] = mimirpb.LabelAdapter{Name: fmt.Sprintf("name_%d", i), Value: fmt.Sprintf("value_%d", i)} } for i := 0; i < len(extra); i += 2 { - ret[i+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} + ret[i/2+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} } slices.SortFunc(ret, func(a, b mimirpb.LabelAdapter) int { switch { @@ -1844,11 +1844,11 @@ func BenchmarkDistributor_Push(b *testing.B) { numSeriesPerRequest = 1000 ) ctx := user.InjectOrgID(context.Background(), "user") + tests := map[string]struct { - prepareConfig func(limits *validation.Limits) - prepareSeries func() ([][]mimirpb.LabelAdapter, []mimirpb.Sample) - expectedErr string - customRegistry *prometheus.Registry + prepareConfig func(limits *validation.Limits) + prepareSeries func() ([][]mimirpb.LabelAdapter, []mimirpb.Sample) + expectedErr string }{ "all samples successfully pushed": { prepareConfig: func(*validation.Limits) {}, @@ -1868,26 +1868,6 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "", }, - "all samples successfully pushed with cost attribution enabled": { - // we should have the cost attribution on team, attribution crosee team 0, 1, 2, 3 - prepareConfig: func(limits *validation.Limits) { - limits.CostAttributionLabels = []string{"team"} - limits.MaxCostAttributionCardinalityPerUser = 100 - }, - prepareSeries: func() ([][]mimirpb.LabelAdapter, []mimirpb.Sample) { - metrics := make([][]mimirpb.LabelAdapter, numSeriesPerRequest) - samples := make([]mimirpb.Sample, numSeriesPerRequest) - for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) - samples[i] = mimirpb.Sample{ - Value: float64(i), - TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), - } - } - return metrics, samples - }, - customRegistry: prometheus.NewRegistry(), - }, "ingestion rate limit reached": { prepareConfig: func(limits *validation.Limits) { limits.IngestionRate = 1 @@ -1929,29 +1909,6 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "received a series whose number of labels exceeds the limit", }, - "too many labels limit reached with cost attribution enabled": { - prepareConfig: func(limits *validation.Limits) { - limits.MaxLabelNamesPerSeries = 30 - limits.CostAttributionLabels = []string{"team"} - 
limits.MaxCostAttributionCardinalityPerUser = 100 - }, - prepareSeries: func() ([][]mimirpb.LabelAdapter, []mimirpb.Sample) { - metrics := make([][]mimirpb.LabelAdapter, numSeriesPerRequest) - samples := make([]mimirpb.Sample, numSeriesPerRequest) - - for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(30, "team", strconv.Itoa(i%4)) - samples[i] = mimirpb.Sample{ - Value: float64(i), - TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), - } - } - - return metrics, samples - }, - customRegistry: prometheus.NewRegistry(), - expectedErr: "received a series whose number of labels exceeds the limit", - }, "max label name length limit reached": { prepareConfig: func(limits *validation.Limits) { limits.MaxLabelNameLength = 200 @@ -1962,7 +1919,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long name. - metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx") + metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx", "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -1983,7 +1940,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long value. - metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1)) + metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1), "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2003,7 +1960,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().Add(time.Hour).UnixNano() / int64(time.Millisecond), @@ -2014,7 +1971,7 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "received a sample whose timestamp is too far in the future", }, - "all samples go to metric_relabel_configs": { + "all samples go to metric relabel configs": { prepareConfig: func(limits *validation.Limits) { limits.MetricRelabelConfigs = []*relabel.Config{ { @@ -2031,7 +1988,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2044,85 +2001,111 @@ func BenchmarkDistributor_Push(b *testing.B) { }, } - for testName, testData := range tests { - b.Run(testName, func(b *testing.B) { - // Create an in-memory KV store for the ring with 1 ingester registered. 
- kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) - b.Cleanup(func() { assert.NoError(b, closer.Close()) }) + costAttributionCases := []struct { + state string + customRegistry *prometheus.Registry + cfg func(limits *validation.Limits) + }{ + { + state: "enabled", + customRegistry: prometheus.NewRegistry(), + cfg: func(limits *validation.Limits) { + limits.CostAttributionLabels = []string{"team"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + }, + { + state: "disabled", + customRegistry: nil, + cfg: func(limits *validation.Limits) {}, + }, + } - err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, - func(_ interface{}) (interface{}, bool, error) { - d := &ring.Desc{} - d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) - return d, true, nil - }, - ) - require.NoError(b, err) - - ingestersRing, err := ring.New(ring.Config{ - KVStore: kv.Config{Mock: kvStore}, - HeartbeatTimeout: 60 * time.Minute, - ReplicationFactor: 1, - }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) - }) + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for testName, testData := range tests { + b.Run(fmt.Sprintf("scenario=%s", testName), func(b *testing.B) { + // Create an in-memory KV store for the ring with 1 ingester registered. + kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + b.Cleanup(func() { assert.NoError(b, closer.Close()) }) - test.Poll(b, time.Second, 1, func() interface{} { - return ingestersRing.InstancesCount() - }) + err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, + func(_ interface{}) (interface{}, bool, error) { + d := &ring.Desc{} + d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) + return d, true, nil + }, + ) + require.NoError(b, err) + + ingestersRing, err := ring.New(ring.Config{ + KVStore: kv.Config{Mock: kvStore}, + HeartbeatTimeout: 60 * time.Minute, + ReplicationFactor: 1, + }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) + }) - // Prepare the distributor configuration. - var distributorCfg Config - var clientConfig client.Config - limits := validation.Limits{} - flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) - distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" + test.Poll(b, time.Second, 1, func() interface{} { + return ingestersRing.InstancesCount() + }) + + // Prepare the distributor configuration. + var distributorCfg Config + var clientConfig client.Config + limits := validation.Limits{} + flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) + distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" - limits.IngestionRate = float64(rate.Inf) // Unlimited. - testData.prepareConfig(&limits) + limits.IngestionRate = float64(rate.Inf) // Unlimited. 
+ testData.prepareConfig(&limits) - distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { - return &noopIngester{}, nil - }) + distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { + return &noopIngester{}, nil + }) - overrides, err := validation.NewOverrides(limits, nil) - require.NoError(b, err) + caCase.cfg(&limits) + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(b, err) - var cam *costattribution.Manager - if testData.customRegistry != nil { - cam = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides) - err := testData.customRegistry.Register(cam) - require.NoError(b, err) - } + // Initialize the cost attribution manager + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides) + err := caCase.customRegistry.Register(cam) + require.NoError(b, err) + } - // Start the distributor. - distributor, err := New(distributorCfg, clientConfig, overrides, nil, cam, ingestersRing, nil, true, nil, log.NewNopLogger()) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) + // Start the distributor. + distributor, err := New(distributorCfg, clientConfig, overrides, nil, cam, ingestersRing, nil, true, nil, log.NewNopLogger()) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) - }) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) + }) - // Prepare the series to remote write before starting the benchmark. - metrics, samples := testData.prepareSeries() + // Prepare the series to remote write before starting the benchmark. + metrics, samples := testData.prepareSeries() - // Run the benchmark. - b.ReportAllocs() - b.ResetTimer() + // Run the benchmark. 
+ b.ReportAllocs() + b.ResetTimer() - for n := 0; n < b.N; n++ { - _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + for n := 0; n < b.N; n++ { + _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) - if testData.expectedErr == "" && err != nil { - b.Fatalf("no error expected but got %v", err) - } - if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { - b.Fatalf("expected %v error but got %v", testData.expectedErr, err) - } + if testData.expectedErr == "" && err != nil { + b.Fatalf("no error expected but got %v", err) + } + if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { + b.Fatalf("expected %v error but got %v", testData.expectedErr, err) + } + } + }) } }) } diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 714db5955ce..bafb17e68d2 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -3590,95 +3590,112 @@ func TestIngester_Push_DecreaseInactiveSeries(t *testing.T) { } func BenchmarkIngesterPush(b *testing.B) { - tests := []struct { - name string - limitsCfg func() validation.Limits - customRegistry prometheus.Registerer + costAttributionCases := []struct { + state string + limitsCfg func(*validation.Limits) + customRegistry *prometheus.Registry }{ { - name: "ingester push succeeded", - limitsCfg: func() validation.Limits { - limitsCfg := defaultLimitsTestConfig() - limitsCfg.NativeHistogramsIngestionEnabled = true - return limitsCfg + state: "enabled", + limitsCfg: func(*validation.Limits) {}, + customRegistry: nil, + }, + { + state: "disabled", + limitsCfg: func(limits *validation.Limits) { + if limits == nil { + return + } + limits.CostAttributionLabels = []string{"cpu"} + limits.MaxCostAttributionCardinalityPerUser = 100 }, + customRegistry: prometheus.NewRegistry(), }, + } + + tests := []struct { + name string + limitsCfg func() validation.Limits + }{ { - name: "ingester push succeeded with cost attribution enabled", + name: "ingester push succeeded", limitsCfg: func() validation.Limits { limitsCfg := defaultLimitsTestConfig() limitsCfg.NativeHistogramsIngestionEnabled = true - limitsCfg.CostAttributionLabels = []string{"cpu"} - limitsCfg.MaxCostAttributionCardinalityPerUser = 100 return limitsCfg }, - customRegistry: prometheus.NewRegistry(), }, } - for _, t := range tests { - b.Run(t.name, func(b *testing.B) { - registry := prometheus.NewRegistry() - ctx := user.InjectOrgID(context.Background(), userID) + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for _, t := range tests { + b.Run(fmt.Sprintf("scenario=%s", t.name), func(b *testing.B) { + registry := prometheus.NewRegistry() + ctx := user.InjectOrgID(context.Background(), userID) - // Create a mocked ingester - cfg := defaultIngesterTestConfig(b) + // Create a mocked ingester + cfg := defaultIngesterTestConfig(b) - limitCfg := t.limitsCfg() - overrides, err := validation.NewOverrides(limitCfg, nil) - require.NoError(b, err) + limitCfg := t.limitsCfg() + caCase.limitsCfg(&limitCfg) - var cam *costattribution.Manager - if t.customRegistry != nil { - cam = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides) - err = t.customRegistry.Register(cam) - require.NoError(b, err) - } + overrides, err := validation.NewOverrides(limitCfg, nil) + require.NoError(b, err) - ingester, err := 
prepareIngesterWithBlockStorageOverridesAndCostAttribution(b, cfg, overrides, nil, "", "", registry, cam) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides) + err = caCase.customRegistry.Register(cam) + require.NoError(b, err) + } - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingester)) - }) + ingester, err := prepareIngesterWithBlockStorageOverridesAndCostAttribution(b, cfg, overrides, nil, "", "", registry, cam) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - // Wait until the ingester is healthy - test.Poll(b, 100*time.Millisecond, 1, func() interface{} { - return ingester.lifecycler.HealthyInstancesCount() - }) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingester)) + }) - // Push a single time series to set the TSDB min time. - metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} - startTime := util.TimeToMillis(time.Now()) + // Wait until the ingester is healthy + test.Poll(b, 100*time.Millisecond, 1, func() interface{} { + return ingester.lifecycler.HealthyInstancesCount() + }) - currTimeReq := mimirpb.ToWriteRequest( - metricLabelAdapters, - []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, - nil, - nil, - mimirpb.API, - ) - _, err = ingester.Push(ctx, currTimeReq) - require.NoError(b, err) + // Push a single time series to set the TSDB min time. + metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} + startTime := util.TimeToMillis(time.Now()) + + currTimeReq := mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, + nil, + nil, + mimirpb.API, + ) + _, err = ingester.Push(ctx, currTimeReq) + require.NoError(b, err) - const ( - series = 10 - samples = 1 - ) + const ( + series = 50 + samples = 1 + ) - allLabels, allSamples := benchmarkData(series) + allLabels, allSamples := benchmarkData(series) - b.ResetTimer() - for iter := 0; iter < b.N; iter++ { - // Bump the timestamp on each of our test samples each time round the loop - for j := 0; j < samples; j++ { - for i := range allSamples { - allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + b.ResetTimer() + for iter := 0; iter < b.N; iter++ { + // Bump the timestamp on each of our test samples each time round the loop + for j := 0; j < samples; j++ { + for i := range allSamples { + allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + } + _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) + require.NoError(b, err) + } } - _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) - require.NoError(b, err) - } + }) } }) } From 9fba531aaba0ea74b8420ee30ca215e6cef8dcf5 Mon Sep 17 00:00:00 2001 From: Charles Korn Date: Mon, 25 Nov 2024 11:54:39 +1100 Subject: [PATCH 30/32] MQE: fix issue where subqueries could return series with no points (#9998) (#10003) * MQE: fix issue where subqueries could return series with no points * Add changelog entry * Add more test cases (cherry picked from commit 1afa9f6750da32c166a55427f0b9cbea227d67da) # Conflicts: # CHANGELOG.md --- CHANGELOG.md | 2 +- pkg/streamingpromql/engine_test.go | 16 
++++++++++++++++ pkg/streamingpromql/query.go | 6 ++++++ .../testdata/ours/subqueries.test | 6 ++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4adf5db7f2..54f2b2ea040 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ * [CHANGE] Ingester: Change `-initial-delay` for circuit breakers to begin when the first request is received, rather than at breaker activation. #9842 * [CHANGE] Query-frontend: apply query pruning before query sharding instead of after. #9913 * [CHANGE] Ingester: remove experimental flags `-ingest-storage.kafka.ongoing-records-per-fetch` and `-ingest-storage.kafka.startup-records-per-fetch`. They are removed in favour of `-ingest-storage.kafka.max-buffered-bytes`. #9906 -* [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #9367 #9368 #9398 #9399 #9403 #9417 #9418 #9419 #9420 #9482 #9504 #9505 #9507 #9518 #9531 #9532 #9533 #9553 #9558 #9588 #9589 #9639 #9641 #9642 #9651 #9664 #9681 #9717 #9719 #9724 #9874 +* [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #9367 #9368 #9398 #9399 #9403 #9417 #9418 #9419 #9420 #9482 #9504 #9505 #9507 #9518 #9531 #9532 #9533 #9553 #9558 #9588 #9589 #9639 #9641 #9642 #9651 #9664 #9681 #9717 #9719 #9724 #9874 #9998 * [FEATURE] Distributor: Add support for `lz4` OTLP compression. #9763 * [FEATURE] Query-frontend: added experimental configuration options `query-frontend.cache-errors` and `query-frontend.results-cache-ttl-for-errors` to allow non-transient responses to be cached. When set to `true` error responses from hitting limits or bad data are cached for a short TTL. #9028 * [FEATURE] Query-frontend: add middleware to control access to specific PromQL experimental functions on a per-tenant basis. #9798 diff --git a/pkg/streamingpromql/engine_test.go b/pkg/streamingpromql/engine_test.go index 1741e779360..49c6098117f 100644 --- a/pkg/streamingpromql/engine_test.go +++ b/pkg/streamingpromql/engine_test.go @@ -904,6 +904,22 @@ func TestSubqueries(t *testing.T) { }, Start: time.Unix(10, 0), }, + { + // A query where SeriesMetadata returns some series but evaluates to no samples should not return anything. + Query: `(metric > Inf)[20s:10s]`, + Start: time.Unix(30, 0), + Result: promql.Result{ + Value: promql.Matrix{}, + }, + }, + { + // A nested subquery with the same properties as above. 
+ Query: `last_over_time((metric > Inf)[20s:10s])[30s:5s]`, + Start: time.Unix(30, 0), + Result: promql.Result{ + Value: promql.Matrix{}, + }, + }, { Query: "metric[20s:5s]", Result: promql.Result{ diff --git a/pkg/streamingpromql/query.go b/pkg/streamingpromql/query.go index 89f8062b118..1f3868126ae 100644 --- a/pkg/streamingpromql/query.go +++ b/pkg/streamingpromql/query.go @@ -740,6 +740,12 @@ func (q *Query) populateMatrixFromRangeVectorOperator(ctx context.Context, o typ return nil, err } + if len(floats) == 0 && len(histograms) == 0 { + types.FPointSlicePool.Put(floats, q.memoryConsumptionTracker) + types.HPointSlicePool.Put(histograms, q.memoryConsumptionTracker) + continue + } + m = append(m, promql.Series{ Metric: s.Labels, Floats: floats, diff --git a/pkg/streamingpromql/testdata/ours/subqueries.test b/pkg/streamingpromql/testdata/ours/subqueries.test index 475acde23cb..fe454c43c05 100644 --- a/pkg/streamingpromql/testdata/ours/subqueries.test +++ b/pkg/streamingpromql/testdata/ours/subqueries.test @@ -127,3 +127,9 @@ eval range from 0 to 4m step 20s sum_over_time(sum_over_time(metric[2m:30s])[3m: eval range from 0 to 4m step 3m sum_over_time(sum_over_time(sum_over_time(metric[2m:30s])[3m:15s])[4m:20s]) {} 0 86 + +eval range from 0 to 4m step 15s last_over_time((metric > Inf)[20s:10s]) + # Should produce no results. + +eval instant at 3m last_over_time((metric > Inf)[20s:10s]) + # Should produce no results. From 12d7d793a3670435ae24bbd0830562f2335ebee5 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 25 Nov 2024 18:10:35 +0100 Subject: [PATCH 31/32] fix service dependencies --- .../config/mimir.yaml | 9 +++++- .../config/mimir_override.yaml | 8 ++++++ pkg/api/api.go | 11 ++++++-- pkg/costattribution/manager.go | 13 ++++++--- pkg/costattribution/manager_test.go | 28 +++++++++---------- pkg/costattribution/tracker.go | 12 ++------ pkg/distributor/distributor_test.go | 5 ++-- pkg/ingester/ingester_test.go | 3 +- pkg/mimir/modules.go | 25 ++++++++--------- 9 files changed, 64 insertions(+), 50 deletions(-) diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 5d245999115..09bc2c5a918 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -1,4 +1,6 @@ multitenancy_enabled: false +cost_attribution_registry_path: "/usage-metrics" +cost_attribution_eviction_interval: 10m distributor: ha_tracker: @@ -184,5 +186,10 @@ limits: ha_replica_label: ha_replica ha_max_clusters: 10 + cost_attribution_labels: "instance" + max_cost_attribution_labels_per_user: 2 + max_cost_attribution_cardinality_per_user: 100 + cost_attribution_cooldown: 20m + runtime_config: - file: ./config/runtime.yaml + file: ./config/runtime.yaml \ No newline at end of file diff --git a/development/mimir-microservices-mode/config/mimir_override.yaml b/development/mimir-microservices-mode/config/mimir_override.yaml index c7c9e8fd2c5..176894eadd6 100644 --- a/development/mimir-microservices-mode/config/mimir_override.yaml +++ b/development/mimir-microservices-mode/config/mimir_override.yaml @@ -1,4 +1,6 @@ multitenancy_enabled: false +cost_attribution_registry_path: "/usage-metrics" +cost_attribution_eviction_interval: 10m distributor: pool: @@ -180,5 +182,11 @@ limits: ha_replica_label: ha_replica ha_max_clusters: 10 + cost_attribution_labels: "instance" + max_cost_attribution_labels_per_user: 2 + max_cost_attribution_cardinality_per_user: 100 + cost_attribution_cooldown: 
20m + runtime_config: file: ./config/runtime.yaml + diff --git a/pkg/api/api.go b/pkg/api/api.go index 4342a65bd52..286ea7c7e77 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -19,8 +19,6 @@ import ( "github.com/grafana/dskit/kv/memberlist" "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/server" - "github.com/prometheus/client_golang/prometheus" - "github.com/grafana/mimir/pkg/alertmanager" "github.com/grafana/mimir/pkg/alertmanager/alertmanagerpb" "github.com/grafana/mimir/pkg/compactor" @@ -43,6 +41,8 @@ import ( util_log "github.com/grafana/mimir/pkg/util/log" "github.com/grafana/mimir/pkg/util/validation" "github.com/grafana/mimir/pkg/util/validation/exporter" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" ) type ConfigHandler func(actualCfg interface{}, defaultCfg interface{}) http.HandlerFunc @@ -278,6 +278,13 @@ func (a *API) RegisterDistributor(d *distributor.Distributor, pushConfig distrib a.RegisterRoute("/distributor/ha_tracker", d.HATracker, false, true, "GET") } +// Function to register the usage metrics route +func (a *API) RegisterUsageMetricsRoute(customRegistryPath string, reg *prometheus.Registry) { + // Create a Prometheus HTTP handler for the custom registry + // Register the handler with the API's routing system + a.RegisterRoute(customRegistryPath, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), true, false, "GET") +} + // Ingester is defined as an interface to allow for alternative implementations // of ingesters to be passed into the API.RegisterIngester() method. type Ingester interface { diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index e6ac57d3aac..b21b269f3f5 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -29,21 +29,26 @@ type Manager struct { // mu protects the trackersByUserID map mtx sync.RWMutex trackersByUserID map[string]*Tracker + reg *prometheus.Registry } // NewManager creates a new cost attribution manager. which is responsible for managing the cost attribution of series. // It will clean up inactive series and update the cost attribution of series every 3 minutes. 
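The signature change below folds registration into construction: the manager registers its collectors on the registry it is given and returns an error if that registration fails. A minimal sketch of the intended wiring, assuming an `overrides` value built via validation.NewOverrides already exists; the path and intervals are illustrative, not the exact values Mimir uses.

	reg := prometheus.NewRegistry()
	cam, err := costattribution.NewManager(3*time.Minute, 30*time.Minute, log.NewNopLogger(), overrides, reg)
	if err != nil {
		return err // the manager could not register its collectors on the dedicated registry
	}
	// Attribution metrics are then served from the configured custom path, not from /metrics.
	http.Handle("/usage-metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
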
-func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager { - s := &Manager{ +func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg *prometheus.Registry) (*Manager, error) { + m := &Manager{ trackersByUserID: make(map[string]*Tracker), limits: limits, mtx: sync.RWMutex{}, inactiveTimeout: inactiveTimeout, logger: logger, + reg: reg, } - s.Service = services.NewTimerService(cleanupInterval, nil, s.iteration, nil).WithName("cost attribution manager") - return s + m.Service = services.NewTimerService(cleanupInterval, nil, m.iteration, nil).WithName("cost attribution manager") + if err := reg.Register(m); err != nil { + return nil, err + } + return m, nil } func (m *Manager) iteration(_ context.Context) error { diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 03fe7b679a4..00f53a77d90 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -12,7 +12,6 @@ import ( "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/grafana/mimir/pkg/util/validation" ) @@ -44,7 +43,12 @@ func getMockLimits(idx int) (*validation.Overrides, error) { func newTestManager() *Manager { logger := log.NewNopLogger() limits, _ := getMockLimits(0) - return NewManager(5*time.Second, 10*time.Second, logger, limits) + reg := prometheus.NewRegistry() + manager, err := NewManager(5*time.Second, 10*time.Second, logger, limits, reg) + if err != nil { + panic(err) + } + return manager } func Test_NewManager(t *testing.T) { @@ -63,8 +67,6 @@ func Test_EnabledForUser(t *testing.T) { func Test_CreateDeleteTracker(t *testing.T) { manager := newTestManager() - reg := prometheus.NewRegistry() - require.NoError(t, reg.Register(manager)) t.Run("Tracker existence and attributes", func(t *testing.T) { user1Tracker := manager.TrackerForUser("user1") @@ -92,7 +94,7 @@ func Test_CreateDeleteTracker(t *testing.T) { # TYPE cortex_received_attributed_samples_total counter cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="custom_attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) }) t.Run("Purge inactive attributions", func(t *testing.T) { @@ -102,7 +104,7 @@ func Test_CreateDeleteTracker(t *testing.T) { # TYPE cortex_discarded_attributed_samples_total counter cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="custom_attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) }) t.Run("Disabling user cost attribution", func(t *testing.T) { @@ -115,7 +117,7 @@ func Test_CreateDeleteTracker(t *testing.T) { # TYPE cortex_received_attributed_samples_total counter 
cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="custom_attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) }) t.Run("Updating user cardinality and labels", func(t *testing.T) { @@ -131,7 +133,7 @@ func Test_CreateDeleteTracker(t *testing.T) { # TYPE cortex_discarded_attributed_samples_total counter cortex_discarded_attributed_samples_total{feature="__missing__",reason="invalid-metrics-name",team="foo",tenant="user3",tracker="custom_attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) }) t.Run("When cost attribution get overflowed, all metrics are purged except overflow metrics", func(t *testing.T) { @@ -144,14 +146,12 @@ func Test_CreateDeleteTracker(t *testing.T) { # TYPE cortex_received_attributed_samples_total counter cortex_received_attributed_samples_total{feature="__overflow__",team="__overflow__",tenant="user3",tracker="custom_attribution"} 2 ` - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) }) } func Test_PurgeInactiveAttributionsUntil(t *testing.T) { manager := newTestManager() - reg := prometheus.NewRegistry() - require.NoError(t, reg.Register(manager)) // Simulate metrics for multiple users to set up initial state manager.TrackerForUser("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) @@ -174,7 +174,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { metricNames := []string{ "cortex_discarded_attributed_samples_total", } - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), metricNames...)) }) t.Run("Purge after inactive timeout", func(t *testing.T) { @@ -194,7 +194,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { metricNames := []string{ "cortex_discarded_attributed_samples_total", } - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), metricNames...)) }) t.Run("Purge all trackers", func(t *testing.T) { @@ -209,6 +209,6 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total", } - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(""), metricNames...)) }) } diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index c28ce2badd4..134fb48c9b6 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -127,11 +127,7 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { return } vals := t.getKeyValues(lbs, now.Unix()) - if 
t.isOverflow { - t.activeSeriesPerUserAttribution.WithLabelValues(overflowValue).Set(1) - } else { - t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() - } + t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() } func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { @@ -139,11 +135,7 @@ func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { return } vals := t.getKeyValues(lbs, now.Unix()) - if t.isOverflow { - t.activeSeriesPerUserAttribution.WithLabelValues(overflowValue).Set(1) - } else { - t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() - } + t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() } func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 2850bdef553..32fec0f55cf 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -2017,7 +2017,7 @@ func BenchmarkDistributor_Push(b *testing.B) { { state: "disabled", customRegistry: nil, - cfg: func(limits *validation.Limits) {}, + cfg: func(_ *validation.Limits) {}, }, } @@ -2074,8 +2074,7 @@ func BenchmarkDistributor_Push(b *testing.B) { // Initialize the cost attribution manager var cam *costattribution.Manager if caCase.customRegistry != nil { - cam = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides) - err := caCase.customRegistry.Register(cam) + cam, err = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) require.NoError(b, err) } diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index bafb17e68d2..10214b119e2 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -3645,8 +3645,7 @@ func BenchmarkIngesterPush(b *testing.B) { var cam *costattribution.Manager if caCase.customRegistry != nil { - cam = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides) - err = caCase.customRegistry.Register(cam) + cam, err = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) require.NoError(b, err) } diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 2ec2fa70323..20effe97d66 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -29,7 +29,6 @@ import ( "github.com/prometheus/alertmanager/featurecontrol" "github.com/prometheus/alertmanager/matchers/compat" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/common/config" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/rules" @@ -481,7 +480,6 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { func (t *Mimir) initDistributor() (serv services.Service, err error) { t.API.RegisterDistributor(t.Distributor, t.Cfg.Distributor, t.Registerer, t.Overrides) - return nil, nil } @@ -652,14 +650,11 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { func (t *Mimir) initCostAttributionService() (services.Service, error) { // The cost attribution service is only initilized if the custom registry path is provided. 
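Once this service is wired in, the distributor and ingester consume it through per-tenant trackers rather than touching the registry directly. A hedged sketch of that call pattern follows; the method names come from earlier patches in this series, while the tenant and label values are illustrative.

	tracker := t.CostAttributionManager.TrackerForUser("user1")
	// Counted on the write path for every accepted sample.
	tracker.IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Now())
	// Counted when validation rejects samples, keyed by rejection reason.
	tracker.IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Now())
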
if t.Cfg.CostAttributionRegistryPath != "" { - t.CostAttributionManager = costattribution.NewManager(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides) - // if custom registry path is provided, create a custom registry and use it for cost attribution service - customRegistry := prometheus.NewRegistry() - // Register the custom registry with the provided URL. - // This allows users to expose custom metrics on a separate endpoint. - // This is useful when users want to expose metrics that are not part of the default Mimir metrics. - http.Handle(t.Cfg.CostAttributionRegistryPath, promhttp.HandlerFor(customRegistry, promhttp.HandlerOpts{Registry: customRegistry})) - err := customRegistry.Register(t.CostAttributionManager) + // If custom registry path is provided, create a custom registry and use it for cost attribution service only + reg := prometheus.NewRegistry() + var err error + t.CostAttributionManager, err = costattribution.NewManager(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) + t.API.RegisterUsageMetricsRoute(t.Cfg.CostAttributionRegistryPath, reg) return t.CostAttributionManager, err } return nil, nil @@ -696,6 +691,7 @@ func (t *Mimir) initIngester() (serv services.Service, err error) { ing = ingester.NewIngesterActivityTracker(t.Ingester, t.ActivityTracker) } t.API.RegisterIngester(ing) + return nil, nil } @@ -1197,10 +1193,11 @@ func (t *Mimir) setupModuleManager() error { IngesterPartitionRing: {MemberlistKV, IngesterRing, API}, Overrides: {RuntimeConfig}, OverridesExporter: {Overrides, MemberlistKV, Vault}, - Distributor: {DistributorService, API, ActiveGroupsCleanupService, CostAttributionService, Vault}, - DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault}, - Ingester: {IngesterService, API, ActiveGroupsCleanupService, CostAttributionService, Vault}, - IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV}, + Distributor: {DistributorService, API, ActiveGroupsCleanupService, Vault}, + DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault, CostAttributionService}, + CostAttributionService: {API, Overrides}, + Ingester: {IngesterService, API, ActiveGroupsCleanupService, Vault}, + IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV, CostAttributionService}, Flusher: {Overrides, API}, Queryable: {Overrides, DistributorService, IngesterRing, IngesterPartitionRing, API, StoreQueryable, MemberlistKV}, Querier: {TenantFederation, Vault}, From e0525e150904649007a5dd0aff9125964310729a Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 27 Nov 2024 21:41:18 +0100 Subject: [PATCH 32/32] fix the distributor crashloop --- .../config/mimir.yaml | 2 +- pkg/api/api.go | 5 +- pkg/costattribution/manager.go | 4 +- pkg/costattribution/tracker.go | 70 +++++++++++++++++-- pkg/ingester/activeseries/active_series.go | 14 ++-- 5 files changed, 77 insertions(+), 18 deletions(-) diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 09bc2c5a918..31702611891 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -186,7 +186,7 @@ limits: ha_replica_label: ha_replica ha_max_clusters: 10 - cost_attribution_labels: "instance" + cost_attribution_labels: "container" max_cost_attribution_labels_per_user: 2 max_cost_attribution_cardinality_per_user: 100 
cost_attribution_cooldown: 20m diff --git a/pkg/api/api.go b/pkg/api/api.go index 286ea7c7e77..3adc1adcaf9 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -19,6 +19,9 @@ import ( "github.com/grafana/dskit/kv/memberlist" "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/server" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/grafana/mimir/pkg/alertmanager" "github.com/grafana/mimir/pkg/alertmanager/alertmanagerpb" "github.com/grafana/mimir/pkg/compactor" @@ -41,8 +44,6 @@ import ( util_log "github.com/grafana/mimir/pkg/util/log" "github.com/grafana/mimir/pkg/util/validation" "github.com/grafana/mimir/pkg/util/validation/exporter" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" ) type ConfigHandler func(actualCfg interface{}, defaultCfg interface{}) http.HandlerFunc diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index b21b269f3f5..694d0008c73 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -79,7 +79,7 @@ func (m *Manager) TrackerForUser(userID string) *Tracker { // if not exists, create a new tracker if _, exists := m.trackersByUserID[userID]; !exists { - m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID)) + m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) } return m.trackersByUserID[userID] } @@ -180,7 +180,7 @@ func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64 // if they are different, we need to update the tracker, we don't mind, just reinitialized the tracker if !CompareCALabels(cat.CALabels(), newTrackedLabels) { m.mtx.Lock() - m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID)) + m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) // update the tracker with the new tracker cat = m.trackersByUserID[userID] m.mtx.Unlock() diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 134fb48c9b6..3f78d1f7265 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -4,9 +4,11 @@ package costattribution import ( "sort" + "strings" "sync" "time" + "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/model/labels" "go.uber.org/atomic" @@ -34,13 +36,17 @@ type Tracker struct { obseveredMtx sync.RWMutex observed map[uint64]*Observation + activeSerieMtx sync.RWMutex + activeSeriesAttributionMap map[string]*atomic.Int64 + hashBuffer []byte isOverflow bool cooldownUntil *atomic.Int64 cooldownDuration int64 + logger log.Logger } -func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration) (*Tracker, error) { +func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) (*Tracker, error) { // keep tracked labels sorted for consistent metric labels sort.Slice(trackedLabels, func(i, j int) bool { return 
trackedLabels[i] < trackedLabels[j] @@ -69,8 +75,11 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. Help: "The total number of active series per user and attribution.", ConstLabels: prometheus.Labels{TrackerLabel: "custom_attribution"}, }, append(trackedLabels, TenantLabel)), - hashBuffer: make([]byte, 0, 1024), - cooldownDuration: int64(cooldown.Seconds()), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: int64(cooldown.Seconds()), + logger: logger, + activeSerieMtx: sync.RWMutex{}, + activeSeriesAttributionMap: map[string]*atomic.Int64{}, } return m, nil } @@ -96,10 +105,25 @@ func (t *Tracker) CooldownDuration() int64 { return t.cooldownDuration } +// sep is used to separate the labels in the key, it is not a valid label caracter +const sep = rune(0x80) + func (t *Tracker) cleanupTrackerAttribution(vals []string) { if t == nil { return } + + var sb strings.Builder + for i, v := range vals { + if i > 0 { + sb.WriteRune(sep) + } + sb.WriteString(v) + } + t.activeSerieMtx.Lock() + delete(t.activeSeriesAttributionMap, sb.String()) + t.activeSerieMtx.Unlock() + t.activeSeriesPerUserAttribution.DeleteLabelValues(vals...) t.receivedSamplesAttribution.DeleteLabelValues(vals...) @@ -116,6 +140,9 @@ func (t *Tracker) cleanupTracker(userID string) { if t == nil { return } + t.activeSerieMtx.Lock() + t.activeSeriesAttributionMap = map[string]*atomic.Int64{} + t.activeSerieMtx.Unlock() filter := prometheus.Labels{TenantLabel: userID} t.activeSeriesPerUserAttribution.DeletePartialMatch(filter) t.receivedSamplesAttribution.DeletePartialMatch(filter) @@ -127,7 +154,20 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { return } vals := t.getKeyValues(lbs, now.Unix()) - t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Inc() + var sb strings.Builder + for i, v := range vals { + if i > 0 { + sb.WriteRune(sep) + } + sb.WriteString(v) + } + t.activeSerieMtx.Lock() + if cnt, ok := t.activeSeriesAttributionMap[sb.String()]; !ok { + t.activeSeriesAttributionMap[sb.String()] = atomic.NewInt64(1) + } else { + cnt.Inc() + } + t.activeSerieMtx.Unlock() } func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { @@ -135,7 +175,18 @@ func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { return } vals := t.getKeyValues(lbs, now.Unix()) - t.activeSeriesPerUserAttribution.WithLabelValues(vals...).Dec() + var sb strings.Builder + for i, v := range vals { + if i > 0 { + sb.WriteRune(sep) + } + sb.WriteString(v) + } + t.activeSerieMtx.Lock() + if cnt, ok := t.activeSeriesAttributionMap[sb.String()]; ok { + cnt.Dec() + } + t.activeSerieMtx.Unlock() } func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { @@ -163,6 +214,13 @@ func (t *Tracker) Collect(out chan<- prometheus.Metric) { if t == nil { return } + t.activeSerieMtx.Lock() + for key, c := range t.activeSeriesAttributionMap { + if c != nil { + t.activeSeriesPerUserAttribution.WithLabelValues(strings.Split(key, string(sep))...).Set(float64(c.Load())) + } + } + t.activeSerieMtx.Unlock() t.activeSeriesPerUserAttribution.Collect(out) t.receivedSamplesAttribution.Collect(out) t.discardedSampleAttribution.Collect(out) @@ -205,6 +263,7 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { return false } + t.obseveredMtx.Lock() // we store up to 2 * maxCardinality observations, if we have seen the stream before, we update the last update time if o, known := t.observed[stream]; 
known && o.lastUpdate != nil && o.lastUpdate.Load() < ts { o.lastUpdate.Store(ts) @@ -214,6 +273,7 @@ func (t *Tracker) overflow(stream uint64, values []string, ts int64) bool { lastUpdate: atomic.NewInt64(ts), } } + t.obseveredMtx.Unlock() // If the maximum cardinality is hit all streams become `__overflow__`, the function would return true. // the origin labels ovserved time is not updated, but the overflow hash is updated. diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index d8763053647..e7895404a22 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -76,7 +76,6 @@ type seriesStripe struct { activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. - buf labels.ScratchBuilder } // seriesEntry holds a timestamp for single series. @@ -451,7 +450,6 @@ func (s *seriesStripe) reinitialize( s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) s.cat = cat - s.buf = labels.NewScratchBuilder(128) } func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { @@ -472,6 +470,7 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(s.activeMatchingNativeHistogramBuckets), s.activeMatchingNativeHistogramBuckets) oldest := int64(math.MaxInt64) + buf := labels.NewScratchBuilder(128) for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { @@ -480,12 +479,11 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { } if idx != nil { - if err := idx.Series(ref, &s.buf, nil); err != nil { + if err := idx.Series(ref, &buf, nil); err != nil { //TODO: think about what to do here _ = err } - s.cat.DecrementActiveSeries(s.buf.Labels(), keepUntil) - s.buf.Reset() + s.cat.DecrementActiveSeries(buf.Labels(), keepUntil) } delete(s.refs, ref) continue @@ -535,12 +533,12 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { s.active-- if idx != nil { - if err := idx.Series(ref, &s.buf, nil); err != nil { + buf := labels.NewScratchBuilder(10) + if err := idx.Series(ref, &buf, nil); err != nil { //TODO: think about what to do here _ = err } - s.cat.DecrementActiveSeries(s.buf.Labels(), time.Now()) - defer s.buf.Reset() + s.cat.DecrementActiveSeries(buf.Labels(), time.Now()) } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms--
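To round out the picture, a hedged sketch of the active-series side that the changes above hook into: the ingester increments the per-attribution count when a series becomes active and decrements it on purge or remove, using labels rebuilt from the TSDB index, and the tracker only materializes the gauge from its internal map when it is scraped via Collect(). The label set below is illustrative.

	cat := manager.TrackerForUser(userID)
	// When a series becomes active in the ingester.
	cat.IncrementActiveSeries(labels.FromStrings("team", "foo"), time.Now())
	// On purge/remove, after the labels have been rebuilt from the index.
	cat.DecrementActiveSeries(labels.FromStrings("team", "foo"), time.Now())
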