Implement Prometheus metrics for LocalQueue #3673

Open · wants to merge 6 commits into main

Changes from 5 commits
3 changes: 3 additions & 0 deletions apis/config/v1beta1/configuration_types.go
@@ -146,6 +146,9 @@ type ControllerMetrics struct {
// metrics will be reported.
// +optional
EnableClusterQueueResources bool `json:"enableClusterQueueResources,omitempty"`

// +optional
EnableLocalQueueMetrics bool `json:"enableLocalQueueMetrics,omitempty"`
Contributor @mimowo commented on Nov 28, 2024:
What is the reason to favor an API field rather than a feature gate? We don't guard other metrics by API, so I don't see such a need, but let us know if there is something specific about these metrics. If the concern is stability of the system due to potential bugs, then a feature gate is enough; we can start from alpha. It would also allow us to simplify the code, since the feature gate status can be checked from any place, so there is no need to pass parameters.
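A minimal sketch of that point, using the upstream k8s.io/component-base/featuregate package directly (the gate name mirrors the one this PR adds; everything else is illustrative, not Kueue's actual wiring):

```go
package main

import (
	"fmt"

	"k8s.io/component-base/featuregate"
)

const LocalQueueMetrics featuregate.Feature = "LocalQueueMetrics"

func main() {
	// A mutable gate, standing in for a process-global feature-gate registry.
	gate := featuregate.NewFeatureGate()
	if err := gate.Add(map[featuregate.Feature]featuregate.FeatureSpec{
		LocalQueueMetrics: {Default: false, PreRelease: featuregate.Alpha},
	}); err != nil {
		panic(err)
	}
	// Typically flipped via --feature-gates=LocalQueueMetrics=true.
	if err := gate.Set("LocalQueueMetrics=true"); err != nil {
		panic(err)
	}

	// Any call site can consult the gate directly; no boolean has to be
	// plumbed through constructors or reconciler options.
	if gate.Enabled(LocalQueueMetrics) {
		fmt.Println("LocalQueue metrics enabled")
	}
}
```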

Contributor commented:

I very much agree, especially when it comes to passing parameters.

Contributor Author commented:

There was a comment about increasing cardinality and wanting to keep this behind a long-term config field.

Contributor commented:

I see, but in that case I would like to go via the KEP process. It's a pity the comment does not mention why cardinality is a problem: is it usability (which could be solved by aggregation), or performance? Do you have other references on why cardinality might be a problem in k8s?

I assume we don't have many more LQs than namespaces, which also led me to check what we do in core k8s. I see that we have metrics depending on Namespace (example). However, in this case CounterOpts.Namespace is used explicitly. Maybe we could also do it this way? PTAL.

If you want this feature in 0.10, I think the only chance is a short KEP, no API change, and guarding it with an alpha feature gate (disabled by default). Then, for a second iteration of alpha, investigate whether we need the API switch.

Contributor Author commented:

The Namespace in the example you link isn't a Kubernetes namespace, from what I understand. It is the project namespace, used to avoid Prometheus metric names clashing.
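A small self-contained sketch of that distinction (assuming the standard prometheus/client_golang API; metric names are illustrative, not the ones added by this PR): CounterOpts.Namespace only prefixes the metric name, whereas a Kubernetes namespace has to be carried as a label, which is what multiplies series cardinality.

```go
package main

import "github.com/prometheus/client_golang/prometheus"

// The Prometheus "namespace" is just a name prefix: this series is exported
// as kueue_admitted_workloads_total, regardless of any Kubernetes namespace.
var admittedWorkloads = prometheus.NewCounter(prometheus.CounterOpts{
	Namespace: "kueue",
	Name:      "admitted_workloads_total",
	Help:      "Total number of admitted workloads.",
})

// A Kubernetes namespace, by contrast, has to be a label, which is what
// multiplies cardinality: one series per (namespace, name) combination.
var localQueueAdmittedWorkloads = prometheus.NewCounterVec(prometheus.CounterOpts{
	Namespace: "kueue",
	Name:      "local_queue_admitted_workloads_total",
	Help:      "Total number of admitted workloads per LocalQueue.",
}, []string{"namespace", "name"})

func main() {
	prometheus.MustRegister(admittedWorkloads, localQueueAdmittedWorkloads)
	admittedWorkloads.Inc()
	localQueueAdmittedWorkloads.WithLabelValues("team-a", "lq-gpu").Inc()
}
```

With the labeled variant, the /metrics output grows with the number of (namespace, LocalQueue) pairs, which is the cardinality concern discussed above.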

Contributor @mimowo commented on Dec 2, 2024:

I see, I thought I had found such an example in k8s, but I was wrong. Seeing no such metrics in k8s suggests that it might indeed be better not to multiply the metrics by namespace. DISCLAIMER: I haven't done an extensive search, just looked at a couple of places.

Since we have such a use case in Kueue, I would be OK with the API knob, but a KEP would be useful either way.

Contributor Author commented:

https://kubernetes.io/docs/reference/instrumentation/metrics/

There are a few metrics here with a namespace label.

There's an existing KEP that I was having a bit of trouble implementing, since it included the ability to use namespace/local_queue selectors for metric collection.

Contributor @mimowo commented on Dec 2, 2024:

> https://kubernetes.io/docs/reference/instrumentation/metrics/
>
> There are a few metrics here with a namespace label.

Interesting, are these metrics opt-in or enabled by default? If k8s core enables them by default, I don't think we need to worry. Basically, I would like to better understand why cardinality is a problem.

Contributor @mimowo commented on Dec 2, 2024:

I would suggest updating the KEP and hiding the metrics behind an alpha feature gate. This will not impact users/customers, and we don't commit to maintaining the API. Then, as a graduation point for beta, we re-evaluate both approaches.

EDIT: to be clear, I'm hesitant; maybe it is actually OK to just preemptively prevent very large outputs from the metrics endpoint. So maybe the API is fine, I will look tomorrow. Cc @tenzen-y.

Contributor Author commented:

> Interesting, are these metrics opt-in or enabled by default? If k8s core enables them by default, I don't think we need to worry. Basically, I would like to better understand why cardinality is a problem.

There are a handful that have graduated to stable and about a dozen that are alpha.

}

// ControllerHealth defines the health configs.
4 changes: 4 additions & 0 deletions cmd/kueue/main.go
@@ -131,6 +131,10 @@ func main() {

metrics.Register()

if features.Enabled(features.LocalQueueMetrics) {
metrics.RegisterLQMetrics()
}

kubeConfig := ctrl.GetConfigOrDie()
if kubeConfig.UserAgent == "" {
kubeConfig.UserAgent = useragent.Default()
5 changes: 5 additions & 0 deletions pkg/cache/cache.go
@@ -431,6 +431,11 @@ func (c *Cache) DeleteClusterQueue(cq *kueue.ClusterQueue) {
if !ok {
return
}
if features.Enabled(features.LocalQueueMetrics) {
for _, q := range c.hm.ClusterQueues[cq.Name].localQueues {
metrics.ClearLocalQueueCacheMetrics(metrics.LQRefFromLocalQueueKey(q.key))
}
}
c.hm.DeleteClusterQueue(cq.Name)
metrics.ClearCacheMetrics(cq.Name)
}
21 changes: 21 additions & 0 deletions pkg/cache/clusterqueue.go
@@ -237,6 +237,11 @@ func (c *clusterQueue) updateQueueStatus() {
if status != c.Status {
c.Status = status
metrics.ReportClusterQueueStatus(c.Name, c.Status)
if features.Enabled(features.LocalQueueMetrics) {
for _, lq := range c.localQueues {
Contributor commented:

This iteration might add unnecessary performance cost. What is the scenario in which it needs to be called here? Maybe we could move the call to be per-LQ, when we update the specific LQ. PTAL.

Contributor Author @KPostOffice commented on Dec 2, 2024:

The LQ status is equal to the CQ status, so when the CQ status updates, all of the CQ's associated LQs should have their statuses updated as well.

metrics.ReportLocalQueueStatus(metrics.LQRefFromLocalQueueKey(lq.key), c.Status)
}
}
}
}

@@ -500,6 +505,12 @@ func (c *clusterQueue) reportActiveWorkloads() {
metrics.ReservingActiveWorkloads.WithLabelValues(c.Name).Set(float64(len(c.Workloads)))
}

func (q *queue) reportActiveWorkloads() {
// q.key has the form "<namespace>/<name>", so qKeySlice[0] is the namespace
// and qKeySlice[1] is the LocalQueue name.
qKeySlice := strings.Split(q.key, "/")
metrics.LocalQueueAdmittedActiveWorkloads.WithLabelValues(qKeySlice[1], qKeySlice[0]).Set(float64(q.admittedWorkloads))
metrics.LocalQueueReservingActiveWorkloads.WithLabelValues(qKeySlice[1], qKeySlice[0]).Set(float64(q.reservingWorkloads))
}

// updateWorkloadUsage updates the usage of the ClusterQueue for the workload
// and the number of admitted workloads for local queues.
func (c *clusterQueue) updateWorkloadUsage(wi *workload.Info, m int64) {
@@ -537,6 +548,9 @@ func (c *clusterQueue) updateWorkloadUsage(wi *workload.Info, m int64) {
updateFlavorUsage(frUsage, lq.admittedUsage, m)
lq.admittedWorkloads += int(m)
}
if features.Enabled(features.LocalQueueMetrics) {
lq.reportActiveWorkloads()
}
}
}

@@ -581,11 +595,18 @@ func (c *clusterQueue) addLocalQueue(q *kueue.LocalQueue) error {
}
}
c.localQueues[qKey] = qImpl
if features.Enabled(features.LocalQueueMetrics) {
qImpl.reportActiveWorkloads()
metrics.ReportLocalQueueStatus(metrics.LQRefFromLocalQueueKey(qKey), c.Status)
}
return nil
}

func (c *clusterQueue) deleteLocalQueue(q *kueue.LocalQueue) {
qKey := queueKey(q)
if features.Enabled(features.LocalQueueMetrics) {
metrics.ClearLocalQueueCacheMetrics(metrics.LQRefFromLocalQueueKey(qKey))
}
delete(c.localQueues, qKey)
}

1 change: 0 additions & 1 deletion pkg/controller/core/core.go
@@ -58,7 +58,6 @@ func SetupControllers(mgr ctrl.Manager, qManager *queue.Manager, cc *cache.Cache
cc,
WithQueueVisibilityUpdateInterval(queueVisibilityUpdateInterval(cfg)),
WithQueueVisibilityClusterQueuesMaxCount(queueVisibilityClusterQueuesMaxCount(cfg)),
WithReportResourceMetrics(cfg.Metrics.EnableClusterQueueResources),
WithFairSharing(fairSharingEnabled),
WithWatchers(rfRec, acRec),
)
61 changes: 60 additions & 1 deletion pkg/controller/core/localqueue_controller.go
@@ -40,7 +40,10 @@ import (
"sigs.k8s.io/kueue/pkg/cache"
"sigs.k8s.io/kueue/pkg/constants"
"sigs.k8s.io/kueue/pkg/controller/core/indexer"
"sigs.k8s.io/kueue/pkg/features"
"sigs.k8s.io/kueue/pkg/metrics"
"sigs.k8s.io/kueue/pkg/queue"
"sigs.k8s.io/kueue/pkg/util/resource"
)

const (
@@ -63,7 +66,24 @@ type LocalQueueReconciler struct {
wlUpdateCh chan event.GenericEvent
}

func NewLocalQueueReconciler(client client.Client, queues *queue.Manager, cache *cache.Cache) *LocalQueueReconciler {
type LocalQueueReconcilerOptions struct {
LocalQueueMetricsEnabled bool
}

type LocalQueueReconcilerOption func(*LocalQueueReconcilerOptions)

var defaultLQOptions = LocalQueueReconcilerOptions{}

func NewLocalQueueReconciler(
client client.Client,
queues *queue.Manager,
cache *cache.Cache,
opts ...LocalQueueReconcilerOption,
) *LocalQueueReconciler {
options := defaultLQOptions
for _, opt := range opts {
opt(&options)
}
return &LocalQueueReconciler{
log: ctrl.Log.WithName("localqueue-reconciler"),
queues: queues,
@@ -142,6 +162,10 @@ func (r *LocalQueueReconciler) Create(e event.CreateEvent) bool {
log.Error(err, "Failed to add localQueue to the cache")
}

if features.Enabled(features.LocalQueueMetrics) {
recordLocalQueueUsageMetrics(q)
}

return true
}

@@ -151,6 +175,11 @@ func (r *LocalQueueReconciler) Delete(e event.DeleteEvent) bool {
// No need to interact with the queue manager for other objects.
return true
}

if features.Enabled(features.LocalQueueMetrics) {
metrics.ClearLocalQueueResourceMetrics(localQueueReferenceFromLocalQueue(q))
}

r.log.V(2).Info("LocalQueue delete event", "localQueue", klog.KObj(q))
r.queues.DeleteLocalQueue(q)
r.cache.DeleteLocalQueue(q)
@@ -191,10 +220,40 @@ func (r *LocalQueueReconciler) Update(e event.UpdateEvent) bool {
}

r.queues.DeleteLocalQueue(oldLq)
if features.Enabled(features.LocalQueueMetrics) {
updateLocalQueueResourceMetrics(newLq)
}

return true
}

func localQueueReferenceFromLocalQueue(lq *kueue.LocalQueue) metrics.LocalQueueReference {
return metrics.LocalQueueReference{
Name: lq.Name,
Namespace: lq.Namespace,
}
}

func recordLocalQueueUsageMetrics(queue *kueue.LocalQueue) {
for _, flavor := range queue.Status.FlavorUsage {
for _, r := range flavor.Resources {
metrics.ReportLocalQueueResourceUsage(localQueueReferenceFromLocalQueue(queue), string(flavor.Name), string(r.Name), resource.QuantityToFloat(&r.Total))
}
}

for _, flavor := range queue.Status.FlavorsReservation {
for _, r := range flavor.Resources {
metrics.ReportLocalQueueResourceReservations(localQueueReferenceFromLocalQueue(queue), string(flavor.Name), string(r.Name), resource.QuantityToFloat(&r.Total))
}
}
}

func updateLocalQueueResourceMetrics(queue *kueue.LocalQueue) {
metrics.ClearLocalQueueResourceMetrics(localQueueReferenceFromLocalQueue(queue))
recordLocalQueueUsageMetrics(queue)
}

func (r *LocalQueueReconciler) Generic(e event.GenericEvent) bool {
r.log.V(3).Info("Got Workload event", "workload", klog.KObj(e.Object))
return true
8 changes: 8 additions & 0 deletions pkg/controller/core/workload_controller.go
@@ -48,6 +48,7 @@ import (
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
"sigs.k8s.io/kueue/pkg/cache"
"sigs.k8s.io/kueue/pkg/controller/core/indexer"
"sigs.k8s.io/kueue/pkg/features"
"sigs.k8s.io/kueue/pkg/metrics"
"sigs.k8s.io/kueue/pkg/queue"
utilac "sigs.k8s.io/kueue/pkg/util/admissioncheck"
@@ -258,6 +259,10 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
r.recorder.Eventf(&wl, corev1.EventTypeNormal, "Admitted", "Admitted by ClusterQueue %v, wait time since reservation was %.0fs", wl.Status.Admission.ClusterQueue, quotaReservedWaitTime.Seconds())
metrics.AdmittedWorkload(kueue.ClusterQueueReference(cqName), queuedWaitTime)
metrics.AdmissionChecksWaitTime(kueue.ClusterQueueReference(cqName), quotaReservedWaitTime)
if features.Enabled(features.LocalQueueMetrics) {
metrics.LocalQueueAdmittedWorkload(metrics.LQRefFromWorkload(&wl), queuedWaitTime)
metrics.LocalQueueAdmissionChecksWaitTime(metrics.LQRefFromWorkload(&wl), quotaReservedWaitTime)
}
}
return ctrl.Result{}, nil
}
@@ -428,6 +433,9 @@ func (r *WorkloadReconciler) reconcileOnLocalQueueActiveState(ctx context.Contex
cqName := string(lq.Spec.ClusterQueue)
if slices.Contains(r.queues.GetClusterQueueNames(), cqName) {
metrics.ReportEvictedWorkloads(cqName, kueue.WorkloadEvictedByLocalQueueStopped)
if features.Enabled(features.LocalQueueMetrics) {
metrics.ReportLocalQueueEvictedWorkloads(metrics.LQRefFromWorkload(wl), kueue.WorkloadEvictedByLocalQueueStopped)
}
}
}
return true, client.IgnoreNotFound(err)
7 changes: 7 additions & 0 deletions pkg/features/kube_features.go
@@ -151,6 +151,12 @@ const (
//
// Workloads keeps allocated quota and preserves QuotaReserved=True when ProvisioningRequest fails
KeepQuotaForProvReqRetry featuregate.Feature = "KeepQuotaForProvReqRetry"

// owner: @kpostoffice
// alpha: v0.10
//
// Enables gathering of LocalQueue metrics
LocalQueueMetrics featuregate.Feature = "LocalQueueMetrics"
)

func init() {
Expand Down Expand Up @@ -180,6 +186,7 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
ExposeFlavorsInLocalQueue: {Default: true, PreRelease: featuregate.Beta},
AdmissionCheckValidationRules: {Default: false, PreRelease: featuregate.Deprecated},
KeepQuotaForProvReqRetry: {Default: false, PreRelease: featuregate.Deprecated},
LocalQueueMetrics: {Default: false, PreRelease: featuregate.Alpha},
}

func SetFeatureGateDuringTest(tb testing.TB, f featuregate.Feature, value bool) {