Fix units for metric taskrun_pod_latency
Prior to this commit, the taskrun_pod_latency metric was calculated
from pod timestamps and the resulting duration was cast directly to
float64. Because Go's time.Duration is an integer nanosecond count,
the metric incorrectly reported nanoseconds when it was intended to
have units of milliseconds.

This commit fixes the duration conversion and adds units to the metric
name, in line with Prometheus best practices.
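
The root cause in one small sketch (illustrative, not taken from the commit):
Go's time.Duration is an int64 count of nanoseconds, so converting it straight
to float64 exposes that count rather than performing a unit conversion.

    package main

    import (
    	"fmt"
    	"time"
    )

    func main() {
    	latency := 1500 * time.Millisecond

    	// A direct cast exposes the internal nanosecond count: prints 1.5e+09.
    	fmt.Println(float64(latency))

    	// Milliseconds() performs the intended conversion: prints 1500.
    	fmt.Println(latency.Milliseconds())
    }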
lbernick authored and tekton-robot committed Jul 10, 2023
1 parent 75dc883 commit 29ddf85
Showing 3 changed files with 6 additions and 6 deletions.
2 changes: 1 addition & 1 deletion docs/metrics.md
@@ -22,7 +22,7 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
- | `tekton_pipelines_controller_taskruns_pod_latency` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`=&lt;taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
+ | `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`=&lt;taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |

The Labels/Tag marked as "*" are optional. And there's a choice between Histogram and LastValue(Gauge) for pipelinerun and taskrun duration metrics.
4 changes: 2 additions & 2 deletions pkg/taskrunmetrics/metrics.go
@@ -86,7 +86,7 @@ var (
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

- podLatency = stats.Float64("taskruns_pod_latency",
+ podLatency = stats.Float64("taskruns_pod_latency_milliseconds",
"scheduling latency for the taskruns pods",
stats.UnitMilliseconds)
)
@@ -438,7 +438,7 @@ func (r *Recorder) RecordPodLatency(ctx context.Context, pod *corev1.Pod, tr *v1
return err
}

- metrics.Record(ctx, podLatency.M(float64(latency)))
+ metrics.Record(ctx, podLatency.M(float64(latency.Milliseconds())))

return nil
}
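
A note on the OpenCensus API in this hunk: the unit passed to stats.Float64
(stats.UnitMilliseconds above) is exporter metadata only and performs no
conversion, so the recorded value must already be in milliseconds; hence the
latency.Milliseconds() fix. A simplified, self-contained sketch of the pattern,
with plain OpenCensus calls standing in for Tekton's metrics wrapper:

    package main

    import (
    	"context"
    	"log"
    	"time"

    	"go.opencensus.io/stats"
    	"go.opencensus.io/stats/view"
    )

    // Illustrative measure mirroring the one in metrics.go; the declared
    // unit is metadata for exporters and does not convert recorded values.
    var podLatency = stats.Float64("taskruns_pod_latency_milliseconds",
    	"scheduling latency for the taskruns pods",
    	stats.UnitMilliseconds)

    func main() {
    	// A LastValue aggregation matches the Gauge listed in docs/metrics.md.
    	if err := view.Register(&view.View{
    		Measure:     podLatency,
    		Aggregation: view.LastValue(),
    	}); err != nil {
    		log.Fatal(err)
    	}

    	latency := 4 * time.Second

    	// The caller converts explicitly; float64(latency) would record nanoseconds.
    	stats.Record(context.Background(), podLatency.M(float64(latency.Milliseconds())))
    }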
6 changes: 3 additions & 3 deletions pkg/taskrunmetrics/metrics_test.go
@@ -531,7 +531,7 @@ func TestRecordPodLatency(t *testing.T) {
"taskrun": "test-taskrun",
"namespace": "foo",
},
- expectedValue: 4e+09,
+ expectedValue: 4000,
}, {
name: "for non scheduled pod",
pod: &corev1.Pod{
@@ -559,7 +559,7 @@ func TestRecordPodLatency(t *testing.T) {
if err != nil {
t.Errorf("RecordPodLatency: %v", err)
}
- metricstest.CheckLastValueData(t, "taskruns_pod_latency", td.expectedTags, td.expectedValue)
+ metricstest.CheckLastValueData(t, "taskruns_pod_latency_milliseconds", td.expectedTags, td.expectedValue)
}
})
}
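
The updated expectation follows from the same unit fix: the test pod's
scheduling delay is 4 seconds, so the old code recorded
float64(4 * time.Second) = 4e+09 (nanoseconds), while the fixed code records
(4 * time.Second).Milliseconds() = 4000.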
@@ -610,7 +610,7 @@ func TestTaskRunIsOfPipelinerun(t *testing.T) {
}

func unregisterMetrics() {
- metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "taskruns_pod_latency")
+ metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "taskruns_pod_latency_milliseconds")

// Allow the recorder singleton to be recreated.
once = sync.Once{}
