This adds a new set of metrics for the selector cache, covering both its
cardinality and the performance of various operations.
Contention in the selector cache can easily make regenerations slower,
and, in special cases, potentially grind the whole agent to a halt.
These metrics let us better observe the performance of selector cache
operations.
Signed-off-by: Odin Ugedal <odin@ugedal.com>
Signed-off-by: Odin Ugedal <ougedal@palantir.com>
[]string{types.LabelSelectorClass},
nil,
)
+
+ selectorCacheSelectorCount = prometheus.NewDesc(
+ prometheus.BuildFQName(metrics.CiliumAgentNamespace, "policy_selector_cache", "selectors"),
+ "The number of selectors in the selector cache",
+ []string{metrics.LabelType},
+ nil,
+ )
+
+ selectorCacheIdentityCount = prometheus.NewDesc(
+ prometheus.BuildFQName(metrics.CiliumAgentNamespace, "policy_selector_cache", "identities"),
+ "The number of identities in the selector cache",
+ []string{metrics.LabelType},
+ nil,
+ )
+
+ selectorCacheOperationDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+ Namespace: metrics.CiliumAgentNamespace,
+ Subsystem: "policy_selector_cache",
+ Name: "operation_duration_seconds",
+ Help: "The latency of selector cache operations",
+ Buckets: []float64{0.0005, 0.001, 0.005, 0.025, 0.05, 0.1, 0.2, 0.4},
+ }, []string{metrics.LabelOperation, metrics.LabelScope, metrics.LabelType})
)
type selectorStats struct {
maxCardinalityByClass map[string]int
+ selectors int
+ identities int
}
func newSelectorStats() selectorStats {
}
func newSelectorCacheMetrics(sc selectorStatsCollector) prometheus.Collector {
- return &selectorCacheMetrics{selectorStatsCollector: sc}
+ return &selectorCacheMetrics{
+ selectorStatsCollector: sc,
+ }
}
func (scm *selectorCacheMetrics) Describe(ch chan<- *prometheus.Desc) {
ch <- selectorCacheMetricsDesc
+ ch <- selectorCacheSelectorCount
+ ch <- selectorCacheIdentityCount
}
func (scm *selectorCacheMetrics) Collect(ch chan<- prometheus.Metric) {
selectorCacheMetricsDesc, prometheus.GaugeValue, float64(stat), class,
)
}
+
+ ch <- prometheus.MustNewConstMetric(
+ selectorCacheSelectorCount, prometheus.GaugeValue, float64(stats.selectors), types.LabelValueSCTypePeer)
+ ch <- prometheus.MustNewConstMetric(
+ selectorCacheIdentityCount, prometheus.GaugeValue, float64(stats.identities), types.LabelValueSCTypePeer)
}
"github.com/cilium/cilium/pkg/metrics"
"github.com/cilium/cilium/pkg/policy/api"
"github.com/cilium/cilium/pkg/policy/types"
+ "github.com/cilium/cilium/pkg/time"
)
var (
return idCache
}
+func (c *scIdentityCache) Len() int {
+ return len(c.ids)
+}
+
func (c *scIdentityCache) insert(nid identity.NumericIdentity, lbls labels.LabelArray) *scIdentity {
namespace, _ := lbls.LookupLabel(&podNamespaceLabel)
id := &scIdentity{
result.maxCardinalityByClass[class] = len(selections)
}
}
+ result.selectors = sc.selectors.Len()
+ result.identities = sc.idCache.Len()
return result
}
if err := metrics.Register(newSelectorCacheMetrics(sc)); err != nil {
sc.logger.Warn("Selector cache metrics registration failed. No metrics will be reported.", logfields.Error, err)
}
+
+ if err := metrics.Register(selectorCacheOperationDuration); err != nil {
+ sc.logger.Warn("Selector cache metrics registration failed. No metrics will be reported.", logfields.Error, err)
+ }
}
// SetLocalIdentityNotifier injects the provided identityNotifier into the
added := false
for i, selector := range selectors {
// Check if the selector has already been cached
+ operationStart := time.Now()
key := selector.Key()
sel, exists := sc.selectors.Get(key)
if !exists {
if sel.addUser(user, sc.localIdentityNotifier) {
added = true
}
-
css[i] = sel
+
+ if !exists {
+ selectorCacheOperationDuration.WithLabelValues(types.LabelValueSCOperationAddSelector, types.LabelValueSCOperation, types.LabelValueSCTypePeer).Observe(time.Since(operationStart).Seconds())
+ }
+
}
return css, added
}
// lock must be held
func (sc *SelectorCache) removeSelectorLocked(selector CachedSelector, user CachedSelectionUser) {
+ start := time.Now()
key := selector.String()
sel, exists := sc.selectors.Get(key)
if exists && sel.removeUser(user, sc.localIdentityNotifier) {
sc.selectors.Delete(sel)
sel.updateSelections()
+ selectorCacheOperationDuration.WithLabelValues(types.LabelValueSCOperationRemoveSelector, types.LabelValueSCOperation, types.LabelValueSCTypePeer).Observe(time.Since(start).Seconds())
}
}
// identities are matched against selectors that have no namespace requirements.
namespaces := map[string]identity.NumericIdentitySlice{"": {}}
+ start := time.Now()
sc.mutex.Lock()
defer sc.mutex.Unlock()
+ operationStart := time.Now()
+ defer func() {
+ selectorCacheOperationDuration.WithLabelValues(types.LabelValueSCOperationIdentityUpdates, types.LabelValueSCOperationLock, types.LabelValueSCTypePeer).Observe(operationStart.Sub(start).Seconds())
+ selectorCacheOperationDuration.WithLabelValues(types.LabelValueSCOperationIdentityUpdates, types.LabelValueSCOperation, types.LabelValueSCTypePeer).Observe(time.Since(operationStart).Seconds())
+ }()
nextRev := sc.revision + 1
// LabelValueSCOther is used for security identities allocated locally
// on the current node.
LabelValueSCOther = "other"
+
+ // LabelValueSCTypePeer is used for the normal selector cache
+ LabelValueSCTypePeer = "peer"
+
+ // LabelValueSCOperationAddSelector is used for the operation that adds a new selector
+ LabelValueSCOperationAddSelector = "add_selector"
+
+ // LabelValueSCOperationRemoveSelector is used for the operation that removes a selector
+ LabelValueSCOperationRemoveSelector = "remove_selector"
+
+ // LabelValueSCOperationIdentityUpdates is used for the operation that updates one or more identities in the cache
+ LabelValueSCOperationIdentityUpdates = "identity_updates"
+
+ // LabelValueSCOperation is the scope label value for the time spent executing the selector cache operation itself
+ LabelValueSCOperation = "operation"
+
+ // LabelValueSCOperationLock is the scope label value for the time spent waiting to acquire the selector cache lock before the operation runs
+ LabelValueSCOperationLock = "lock"
)
type PolicyMetrics interface {