Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reworks the prometheus metrics to adhere to best practices #5174

Closed
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ Here is an overview of all new **experimental** features:
### Improvements

- **General**: Add parameter queryParameters to prometheus-scaler ([#4962](https://github.com/kedacore/keda/issues/4962))
- **General**: Renamed Prometheus metrics to include units and `total` where approriate ([#4854](https://github.com/kedacore/keda/issues/4854))
- **General**: Support TriggerAuthentication properties from ConfigMap ([#4830](https://github.com/kedacore/keda/issues/4830))
- **Hashicorp Vault**: Add support to get secret that needs write operation (e.g. pki) ([#5067](https://github.com/kedacore/keda/issues/5067))
- **Kafka Scaler**: Ability to set upper bound to the number of partitions with lag ([#3997](https://github.com/kedacore/keda/issues/3997))
Expand All @@ -82,6 +83,7 @@ You can find all deprecations in [this overview](https://github.com/kedacore/ked
New deprecation(s):

- Remove support for Azure AD Pod Identity-based authentication ([#5035](https://github.com/kedacore/keda/issues/5035))
- Various Prometheus metrics have been renamed to follow the preferred naming conventions. The old ones are still available, but will be removed in the future ([#4854](https://github.com/kedacore/keda/issues/4854)).

### Breaking Changes

Expand Down
6 changes: 3 additions & 3 deletions config/grafana/keda-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@
"uid": "${datasource}"
},
"editorMode": "code",
"expr": "sum by(job) (rate(keda_scaler_errors{}[5m]))",
"expr": "sum by(job) (rate(keda_scaler_errors_total{}[5m]))",
"legendFormat": "{{ job }}",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -313,7 +313,7 @@
"uid": "${datasource}"
},
"editorMode": "code",
"expr": "sum by(scaler) (rate(keda_scaler_errors{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\", scaler=~\"$scaler\"}[5m]))",
"expr": "sum by(scaler) (rate(keda_scaler_errors_total{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\", scaler=~\"$scaler\"}[5m]))",
"legendFormat": "{{ scaler }}",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -423,7 +423,7 @@
"uid": "${datasource}"
},
"editorMode": "code",
"expr": "sum by(scaledObject) (rate(keda_scaled_object_errors{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\"}[5m]))",
"expr": "sum by(scaledObject) (rate(keda_scaled_object_errors_total{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\"}[5m]))",
"legendFormat": "{{ scaledObject }}",
"range": true,
"refId": "A"
Expand Down
10 changes: 6 additions & 4 deletions pkg/metricscollector/metricscollectors.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ limitations under the License.

package metricscollector

import "time"

const (
ClusterTriggerAuthenticationResource = "cluster_trigger_authentication"
TriggerAuthenticationResource = "trigger_authentication"
Expand All @@ -33,10 +35,10 @@ type MetricsCollector interface {
RecordScalerMetric(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64)

// RecordScalerLatency create a measurement of the latency to external metric
RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64)
RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value time.Duration)

// RecordScalableObjectLatency create a measurement of the latency executing scalable object loop
RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64)
RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration)

// RecordScalerActive create a measurement of the activity of the scaler
RecordScalerActive(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, active bool)
Expand Down Expand Up @@ -79,14 +81,14 @@ func RecordScalerMetric(namespace string, scaledObject string, scaler string, sc
}

// RecordScalerLatency create a measurement of the latency to external metric
func RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64) {
func RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value time.Duration) {
for _, element := range collectors {
element.RecordScalerLatency(namespace, scaledObject, scaler, scalerIndex, metric, value)
}
}

// RecordScalableObjectLatency create a measurement of the latency executing scalable object loop
func RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64) {
func RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration) {
for _, element := range collectors {
element.RecordScalableObjectLatency(namespace, name, isScaledObject, value)
}
Expand Down
57 changes: 38 additions & 19 deletions pkg/metricscollector/opentelemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"runtime"
"strconv"
"time"

"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
Expand All @@ -22,12 +23,14 @@ const meterName = "keda-open-telemetry-metrics"
const defaultNamespace = "default"

var (
meterProvider *metric.MeterProvider
meter api.Meter
otScalerErrorsCounter api.Int64Counter
otScaledObjectErrorsCounter api.Int64Counter
otTriggerTotalsCounter api.Int64UpDownCounter
otCrdTotalsCounter api.Int64UpDownCounter
meterProvider *metric.MeterProvider
meter api.Meter
otScalerErrorsCounter api.Int64Counter
otScaledObjectErrorsCounter api.Int64Counter
otTriggerTotalsCounterDeprecated api.Int64UpDownCounter
otCrdTotalsCounterDeprecated api.Int64UpDownCounter
otTriggerRegisteredTotalsCounter api.Int64UpDownCounter
otCrdRegisteredTotalsCounter api.Int64UpDownCounter

otelScalerMetricVal OtelMetricFloat64Val
otelScalerMetricsLatencyVal OtelMetricFloat64Val
Expand Down Expand Up @@ -86,19 +89,29 @@ func initMeters() {
otLog.Error(err, msg)
}

otTriggerTotalsCounter, err = meter.Int64UpDownCounter("keda.trigger.totals", api.WithDescription("Total triggers"))
otTriggerTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.trigger.totals", api.WithDescription("DEPRECATED - will be removed in 2.15 - use 'keda.triggers.count' instead"))
if err != nil {
otLog.Error(err, msg)
}

otCrdTotalsCounter, err = meter.Int64UpDownCounter("keda.resource.totals", api.WithDescription("Total resources"))
otTriggerRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.trigger.registered.count", api.WithDescription("Total number of triggers per trigger type registered"))
if err != nil {
otLog.Error(err, msg)
}

otCrdTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.resource.totals", api.WithDescription("DEPRECATED - will be removed in 2.15 - use 'keda.resources.count' instead"))
if err != nil {
otLog.Error(err, msg)
}

otCrdRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.resource.registered.count", api.WithDescription("Total number of KEDA custom resources per namespace for each custom resource type (CRD) registered"))
if err != nil {
otLog.Error(err, msg)
}

_, err = meter.Float64ObservableGauge(
"keda.scaler.metrics.value",
api.WithDescription("Metric Value used for HPA"),
api.WithDescription("The current value for each scaler's metric that would be used by the HPA in computing the target average"),
api.WithFloat64Callback(ScalerMetricValueCallback),
)
if err != nil {
Expand All @@ -107,7 +120,8 @@ func initMeters() {

_, err = meter.Float64ObservableGauge(
"keda.scaler.metrics.latency",
api.WithDescription("Scaler Metrics Latency"),
api.WithDescription("The latency of retrieving current metric from each scaler"),
api.WithUnit("s"),
api.WithFloat64Callback(ScalerMetricsLatencyCallback),
)
if err != nil {
Expand All @@ -117,6 +131,7 @@ func initMeters() {
_, err = meter.Float64ObservableGauge(
"keda.internal.scale.loop.latency",
api.WithDescription("Internal latency of ScaledObject/ScaledJob loop execution"),
api.WithUnit("s"),
api.WithFloat64Callback(ScalableObjectLatencyCallback),
)
if err != nil {
Expand All @@ -125,7 +140,7 @@ func initMeters() {

_, err = meter.Float64ObservableGauge(
"keda.scaler.active",
api.WithDescription("Activity of a Scaler Metric"),
api.WithDescription("Indicates whether a scaler is active (1), or not (0)"),
api.WithFloat64Callback(ScalerActiveCallback),
)
if err != nil {
Expand Down Expand Up @@ -185,8 +200,8 @@ func ScalerMetricsLatencyCallback(_ context.Context, obsrv api.Float64Observer)
}

// RecordScalerLatency create a measurement of the latency to external metric
func (o *OtelMetrics) RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64) {
otelScalerMetricsLatencyVal.val = value
func (o *OtelMetrics) RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value time.Duration) {
otelScalerMetricsLatencyVal.val = value.Seconds()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
otelScalerMetricsLatencyVal.val = value.Seconds()
otelScalerMetricsLatencyVal.val = value.Milliseconds()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's been settled that KEDA will use Seconds in opentelemetry metrics.

otelScalerMetricsLatencyVal.measurementOption = getScalerMeasurementOption(namespace, scaledObject, scaler, scalerIndex, metric)
}

Expand All @@ -199,7 +214,7 @@ func ScalableObjectLatencyCallback(_ context.Context, obsrv api.Float64Observer)
}

// RecordScalableObjectLatency create a measurement of the latency executing scalable object loop
func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64) {
func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration) {
resourceType := "scaledjob"
if isScaledObject {
resourceType = "scaledobject"
Expand All @@ -210,7 +225,7 @@ func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string,
attribute.Key("type").String(resourceType),
attribute.Key("name").String(name))

otelInternalLoopLatencyVal.val = value
otelInternalLoopLatencyVal.val = value.Seconds()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
otelInternalLoopLatencyVal.val = value.Seconds()
otelInternalLoopLatencyVal.val = value.Milliseconds()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's been settled that KEDA will use Seconds in opentelemetry metrics.

otelInternalLoopLatencyVal.measurementOption = opt
}

Expand Down Expand Up @@ -281,13 +296,15 @@ func (o *OtelMetrics) RecordScaledObjectError(namespace string, scaledObject str

func (o *OtelMetrics) IncrementTriggerTotal(triggerType string) {
if triggerType != "" {
otTriggerTotalsCounter.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType)))
otTriggerTotalsCounterDeprecated.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType)))
otTriggerRegisteredTotalsCounter.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType)))
}
}

func (o *OtelMetrics) DecrementTriggerTotal(triggerType string) {
if triggerType != "" {
otTriggerTotalsCounter.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType)))
otTriggerTotalsCounterDeprecated.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType)))
otTriggerRegisteredTotalsCounter.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType)))
}
}

Expand All @@ -300,7 +317,8 @@ func (o *OtelMetrics) IncrementCRDTotal(crdType, namespace string) {
attribute.Key("type").String(crdType),
)

otCrdTotalsCounter.Add(context.Background(), 1, opt)
otCrdTotalsCounterDeprecated.Add(context.Background(), 1, opt)
otCrdRegisteredTotalsCounter.Add(context.Background(), 1, opt)
}

func (o *OtelMetrics) DecrementCRDTotal(crdType, namespace string) {
Expand All @@ -312,7 +330,8 @@ func (o *OtelMetrics) DecrementCRDTotal(crdType, namespace string) {
attribute.Key("namespace").String(namespace),
attribute.Key("type").String(crdType),
)
otCrdTotalsCounter.Add(context.Background(), -1, opt)
otCrdTotalsCounterDeprecated.Add(context.Background(), -1, opt)
otCrdRegisteredTotalsCounter.Add(context.Background(), -1, opt)
}

func getScalerMeasurementOption(namespace string, scaledObject string, scaler string, scalerIndex int, metric string) api.MeasurementOption {
Expand Down
30 changes: 24 additions & 6 deletions pkg/metricscollector/opentelemetry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package metricscollector
import (
"context"
"testing"
"time"

"github.com/stretchr/testify/assert"
"go.opentelemetry.io/otel/sdk/metric"
Expand Down Expand Up @@ -59,11 +60,11 @@ func TestIncrementTriggerTotal(t *testing.T) {
assert.Nil(t, err)
scopeMetrics := got.ScopeMetrics[0]
assert.NotEqual(t, len(scopeMetrics.Metrics), 0)
buildInfo := retrieveMetric(scopeMetrics.Metrics, "keda.trigger.totals")
triggercount := retrieveMetric(scopeMetrics.Metrics, "keda.trigger.registered.count")

assert.NotNil(t, buildInfo)
assert.NotNil(t, triggercount)

data := buildInfo.Data.(metricdata.Sum[int64]).DataPoints[0]
data := triggercount.Data.(metricdata.Sum[int64]).DataPoints[0]
assert.Equal(t, data.Value, int64(1))

testOtel.DecrementTriggerTotal("testtrigger")
Expand All @@ -72,10 +73,27 @@ func TestIncrementTriggerTotal(t *testing.T) {
assert.Nil(t, err)
scopeMetrics = got.ScopeMetrics[0]
assert.NotEqual(t, len(scopeMetrics.Metrics), 0)
buildInfo = retrieveMetric(scopeMetrics.Metrics, "keda.trigger.totals")
triggercount = retrieveMetric(scopeMetrics.Metrics, "keda.trigger.registered.count")

assert.NotNil(t, buildInfo)
assert.NotNil(t, triggercount)

data = buildInfo.Data.(metricdata.Sum[int64]).DataPoints[0]
data = triggercount.Data.(metricdata.Sum[int64]).DataPoints[0]
assert.Equal(t, data.Value, int64(0))
}

func TestLoopLatency(t *testing.T) {
testOtel.RecordScalableObjectLatency("namespace", "name", true, 500*time.Millisecond)
got := metricdata.ResourceMetrics{}
err := testReader.Collect(context.Background(), &got)

assert.Nil(t, err)
scopeMetrics := got.ScopeMetrics[0]
assert.NotEqual(t, len(scopeMetrics.Metrics), 0)
latency := retrieveMetric(scopeMetrics.Metrics, "keda.internal.scale.loop.latency")

assert.NotNil(t, latency)
assert.Equal(t, latency.Unit, "s")

data := latency.Data.(metricdata.Gauge[float64]).DataPoints[0]
assert.Equal(t, data.Value, float64(0.5))
}
Loading