git.feebdaed.xyz Git - 0xmirror/cilium.git/commitdiff
policy: add selector cache metrics
author: Odin Ugedal <ougedal@palantir.com>
Wed, 19 Nov 2025 11:45:15 +0000 (12:45 +0100)
committer: André Martins <aanm@users.noreply.github.com>
Thu, 18 Dec 2025 04:14:42 +0000 (04:14 +0000)
This adds a new set of metrics for the selector cache, covering both its
cardinality and the performance of various operations.

Contention in the selector cache can easily make regenerations slower,
and in special cases can potentially grind the whole agent to a halt.
These metrics allow us to better observe the performance of these
operations.

Signed-off-by: Odin Ugedal <odin@ugedal.com>
Signed-off-by: Odin Ugedal <ougedal@palantir.com>
pkg/policy/metrics.go
pkg/policy/selectorcache.go
pkg/policy/types/metrics.go

index c5c8487faf9afc2db25e4ef6725e979af9f19f3d..b68d867ead0d7396bb8f3e48af6b8ba88d6d5ace 100644 (file)
@@ -17,10 +17,34 @@ var (
                []string{types.LabelSelectorClass},
                nil,
        )
+
+       selectorCacheSelectorCount = prometheus.NewDesc(
+               prometheus.BuildFQName(metrics.CiliumAgentNamespace, "policy_selector_cache", "selectors"),
+               "The number of selectors in the selector cache",
+               []string{metrics.LabelType},
+               nil,
+       )
+
+       selectorCacheIdentityCount = prometheus.NewDesc(
+               prometheus.BuildFQName(metrics.CiliumAgentNamespace, "policy_selector_cache", "identities"),
+               "The number of identities in the selector cache",
+               []string{metrics.LabelType},
+               nil,
+       )
+
+       selectorCacheOperationDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+               Namespace: metrics.CiliumAgentNamespace,
+               Subsystem: "policy_selector_cache",
+               Name:      "operation_duration_seconds",
+               Help:      "The latency of selector cache operations",
+               Buckets:   []float64{0.0005, 0.001, 0.005, 0.025, 0.05, 0.1, 0.2, 0.4},
+       }, []string{metrics.LabelOperation, metrics.LabelScope, metrics.LabelType})
 )
 
 type selectorStats struct {
        maxCardinalityByClass map[string]int
+       selectors             int
+       identities            int
 }
 
 func newSelectorStats() selectorStats {
@@ -44,11 +68,15 @@ type selectorCacheMetrics struct {
 }
 
 func newSelectorCacheMetrics(sc selectorStatsCollector) prometheus.Collector {
-       return &selectorCacheMetrics{selectorStatsCollector: sc}
+       return &selectorCacheMetrics{
+               selectorStatsCollector: sc,
+       }
 }
 
 func (scm *selectorCacheMetrics) Describe(ch chan<- *prometheus.Desc) {
        ch <- selectorCacheMetricsDesc
+       ch <- selectorCacheSelectorCount
+       ch <- selectorCacheIdentityCount
 }
 
 func (scm *selectorCacheMetrics) Collect(ch chan<- prometheus.Metric) {
@@ -59,4 +87,9 @@ func (scm *selectorCacheMetrics) Collect(ch chan<- prometheus.Metric) {
                        selectorCacheMetricsDesc, prometheus.GaugeValue, float64(stat), class,
                )
        }
+
+       ch <- prometheus.MustNewConstMetric(
+               selectorCacheSelectorCount, prometheus.GaugeValue, float64(stats.selectors), types.LabelValueSCTypePeer)
+       ch <- prometheus.MustNewConstMetric(
+               selectorCacheIdentityCount, prometheus.GaugeValue, float64(stats.identities), types.LabelValueSCTypePeer)
 }
index af08a6a04ed5d5ac020f9ac7d1856e6c95c9f65c..3aa81d33fe1d531881699571ece28836e8fa929e 100644 (file)
@@ -18,6 +18,7 @@ import (
        "github.com/cilium/cilium/pkg/metrics"
        "github.com/cilium/cilium/pkg/policy/api"
        "github.com/cilium/cilium/pkg/policy/types"
+       "github.com/cilium/cilium/pkg/time"
 )
 
 var (
@@ -50,6 +51,10 @@ func newScIdentityCache(ids identity.IdentityMap) scIdentityCache {
        return idCache
 }
 
+func (c *scIdentityCache) Len() int {
+       return len(c.ids)
+}
+
 func (c *scIdentityCache) insert(nid identity.NumericIdentity, lbls labels.LabelArray) *scIdentity {
        namespace, _ := lbls.LookupLabel(&podNamespaceLabel)
        id := &scIdentity{
@@ -343,6 +348,8 @@ func (sc *SelectorCache) Stats() selectorStats {
                        result.maxCardinalityByClass[class] = len(selections)
                }
        }
+       result.selectors = sc.selectors.Len()
+       result.identities = sc.idCache.Len()
 
        return result
 }
@@ -444,6 +451,10 @@ func (sc *SelectorCache) RegisterMetrics() {
        if err := metrics.Register(newSelectorCacheMetrics(sc)); err != nil {
                sc.logger.Warn("Selector cache metrics registration failed. No metrics will be reported.", logfields.Error, err)
        }
+
+       if err := metrics.Register(selectorCacheOperationDuration); err != nil {
+               sc.logger.Warn("Selector cache metrics registration failed. No metrics will be reported.", logfields.Error, err)
+       }
 }
 
 // SetLocalIdentityNotifier injects the provided identityNotifier into the
@@ -552,6 +563,7 @@ func (sc *SelectorCache) addSelectorsTxn(user CachedSelectionUser, lbls stringLa
        added := false
        for i, selector := range selectors {
                // Check if the selector has already been cached
+               operationStart := time.Now()
                key := selector.Key()
                sel, exists := sc.selectors.Get(key)
                if !exists {
@@ -562,8 +574,12 @@ func (sc *SelectorCache) addSelectorsTxn(user CachedSelectionUser, lbls stringLa
                if sel.addUser(user, sc.localIdentityNotifier) {
                        added = true
                }
-
                css[i] = sel
+
+               if !exists {
+                       selectorCacheOperationDuration.WithLabelValues(types.LabelValueSCOperationAddSelector, types.LabelValueSCOperation, types.LabelValueSCTypePeer).Observe(time.Since(operationStart).Seconds())
+               }
+
        }
        return css, added
 }
@@ -615,11 +631,13 @@ func (sc *SelectorCache) AddIdentitySelectorForTest(user CachedSelectionUser, lb
 
 // lock must be held
 func (sc *SelectorCache) removeSelectorLocked(selector CachedSelector, user CachedSelectionUser) {
+       start := time.Now()
        key := selector.String()
        sel, exists := sc.selectors.Get(key)
        if exists && sel.removeUser(user, sc.localIdentityNotifier) {
                sc.selectors.Delete(sel)
                sel.updateSelections()
+               selectorCacheOperationDuration.WithLabelValues(types.LabelValueSCOperationRemoveSelector, types.LabelValueSCOperation, types.LabelValueSCTypePeer).Observe(time.Since(start).Seconds())
        }
 }
 
@@ -745,8 +763,14 @@ func (sc *SelectorCache) UpdateIdentities(added, deleted identity.IdentityMap, w
        // identities are matched against selectors that have no namespace requirements.
        namespaces := map[string]identity.NumericIdentitySlice{"": {}}
 
+       start := time.Now()
        sc.mutex.Lock()
        defer sc.mutex.Unlock()
+       operationStart := time.Now()
+       defer func() {
+               selectorCacheOperationDuration.WithLabelValues(types.LabelValueSCOperationIdentityUpdates, types.LabelValueSCOperationLock, types.LabelValueSCTypePeer).Observe(operationStart.Sub(start).Seconds())
+               selectorCacheOperationDuration.WithLabelValues(types.LabelValueSCOperationIdentityUpdates, types.LabelValueSCOperation, types.LabelValueSCTypePeer).Observe(time.Since(operationStart).Seconds())
+       }()
 
        nextRev := sc.revision + 1
 
index 5724dc1c3df562da785868ab46f81c373b1740e4..f26cbb37be9faf1e817a281f8efb48e5307e1d45 100644 (file)
@@ -20,6 +20,24 @@ const (
        // LabelValueSCOther is used for security identities allocated locally
        // on the current node.
        LabelValueSCOther = "other"
+
+       // LabelValueSCTypePeer is used for the normal selector cache
+       LabelValueSCTypePeer = "peer"
+
+       // LabelValueSCOperationAddSelector is used for the operation that adds a new selector
+       LabelValueSCOperationAddSelector = "add_selector"
+
+       // LabelValueSCOperationRemoveSelector is used for the operation that removes a selector
+       LabelValueSCOperationRemoveSelector = "remove_selector"
+
+       // LabelValueSCOperationIdentityUpdates is used for the operation that updates one or more identities in the cache
+       LabelValueSCOperationIdentityUpdates = "identity_updates"
+
+       // LabelValueSCOperation is used for the actual Selector Cache Operation duration
+       LabelValueSCOperation = "operation"
+
+       // LabelValueSCOperationLock is used for the time spent waiting to acquire the lock before the Selector Cache Operation runs
+       LabelValueSCOperationLock = "lock"
 )
 
 type PolicyMetrics interface {