From c748b15efe51f7c3e1da26a1159cc62e61626e56 Mon Sep 17 00:00:00 2001 From: Hector Fernandez Date: Mon, 24 Feb 2020 20:49:44 +0100 Subject: [PATCH 1/4] feat: add custom kubefed metrics --- .../app/controller-manager.go | 3 + .../kubefedcluster/clusterclient.go | 4 + pkg/controller/kubefedcluster/controller.go | 6 + pkg/controller/sync/controller.go | 2 + pkg/controller/util/federated_informer.go | 5 + pkg/metrics/metrics.go | 118 ++++++++++++++++++ 6 files changed, 138 insertions(+) create mode 100644 pkg/metrics/metrics.go diff --git a/cmd/controller-manager/app/controller-manager.go b/cmd/controller-manager/app/controller-manager.go index 41e50bd492..815d7f139c 100644 --- a/cmd/controller-manager/app/controller-manager.go +++ b/cmd/controller-manager/app/controller-manager.go @@ -57,6 +57,7 @@ import ( "sigs.k8s.io/kubefed/pkg/controller/servicedns" "sigs.k8s.io/kubefed/pkg/controller/util" "sigs.k8s.io/kubefed/pkg/features" + kubefedmetrics "sigs.k8s.io/kubefed/pkg/metrics" "sigs.k8s.io/kubefed/pkg/version" ) @@ -114,6 +115,8 @@ func Run(opts *options.Options, stopChan <-chan struct{}) error { go serveHealthz(healthzAddr) go serveMetrics(metricsAddr, stopChan) + // Register kubefed custom metrics + kubefedmetrics.RegisterAll() var err error opts.Config.KubeConfig, err = clientcmd.BuildConfigFromFlags(masterURL, kubeconfig) diff --git a/pkg/controller/kubefedcluster/clusterclient.go b/pkg/controller/kubefedcluster/clusterclient.go index cc70c8f46f..241379ea5c 100644 --- a/pkg/controller/kubefedcluster/clusterclient.go +++ b/pkg/controller/kubefedcluster/clusterclient.go @@ -34,6 +34,7 @@ import ( fedv1b1 "sigs.k8s.io/kubefed/pkg/apis/core/v1beta1" "sigs.k8s.io/kubefed/pkg/client/generic" "sigs.k8s.io/kubefed/pkg/controller/util" + "sigs.k8s.io/kubefed/pkg/metrics" ) const ( @@ -128,10 +129,13 @@ func (self *ClusterClient) GetClusterHealthStatus() (*fedv1b1.KubeFedClusterStat if err != nil { runtime.HandleError(errors.Wrapf(err, "Failed to do cluster health check 
for cluster %q", self.clusterName)) clusterStatus.Conditions = append(clusterStatus.Conditions, newClusterOfflineCondition) + metrics.RegisterKubefedClusterOfflineCount(self.clusterName) } else { if !strings.EqualFold(string(body), "ok") { + metrics.RegisterKubefedClusterNotReadyCount(self.clusterName) clusterStatus.Conditions = append(clusterStatus.Conditions, newClusterNotReadyCondition, newClusterNotOfflineCondition) } else { + metrics.RegisterKubefedClusterReadyCount(self.clusterName) clusterStatus.Conditions = append(clusterStatus.Conditions, newClusterReadyCondition) } } diff --git a/pkg/controller/kubefedcluster/controller.go b/pkg/controller/kubefedcluster/controller.go index f6025d9123..ccdb1de48c 100644 --- a/pkg/controller/kubefedcluster/controller.go +++ b/pkg/controller/kubefedcluster/controller.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "sync" + "time" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" @@ -41,6 +42,7 @@ import ( genscheme "sigs.k8s.io/kubefed/pkg/client/generic/scheme" "sigs.k8s.io/kubefed/pkg/controller/util" "sigs.k8s.io/kubefed/pkg/features" + "sigs.k8s.io/kubefed/pkg/metrics" ) // ClusterData stores cluster client and previous health check probe results of individual cluster. 
@@ -241,6 +243,7 @@ func (cc *ClusterController) updateIndividualClusterStatus(cluster *fedv1b1.Kube storedData *ClusterData, wg *sync.WaitGroup) { clusterClient := storedData.clusterKubeClient + clusterHealthStatusStart := time.Now() currentClusterStatus, err := clusterClient.GetClusterHealthStatus() if err != nil { cc.RecordError(cluster, "RetrievingClusterHealthFailed", errors.Wrap(err, "Failed to retrieve health of the cluster")) @@ -257,6 +260,9 @@ func (cc *ClusterController) updateIndividualClusterStatus(cluster *fedv1b1.Kube if err := cc.client.UpdateStatus(context.TODO(), cluster); err != nil { klog.Warningf("Failed to update the status of cluster %q: %v", cluster.Name, err) } + + metrics.UpdateDurationFromStart(metrics.ClusterHealthStatus, clusterHealthStatusStart) + wg.Done() } diff --git a/pkg/controller/sync/controller.go b/pkg/controller/sync/controller.go index dc6eac572f..2360293a33 100644 --- a/pkg/controller/sync/controller.go +++ b/pkg/controller/sync/controller.go @@ -46,6 +46,7 @@ import ( "sigs.k8s.io/kubefed/pkg/controller/sync/status" "sigs.k8s.io/kubefed/pkg/controller/util" finalizersutil "sigs.k8s.io/kubefed/pkg/controller/util/finalizers" + "sigs.k8s.io/kubefed/pkg/metrics" ) const ( @@ -266,6 +267,7 @@ func (s *KubeFedSyncController) reconcile(qualifiedName util.QualifiedName) util startTime := time.Now() defer func() { klog.V(4).Infof("Finished reconciling %s %q (duration: %v)", kind, key, time.Since(startTime)) + metrics.UpdateDurationFromStart(metrics.ReconcileFederatedResources, startTime) }() if fedResource.Object().GetDeletionTimestamp() != nil { diff --git a/pkg/controller/util/federated_informer.go b/pkg/controller/util/federated_informer.go index f268c3ad6a..649af10f4c 100644 --- a/pkg/controller/util/federated_informer.go +++ b/pkg/controller/util/federated_informer.go @@ -34,6 +34,7 @@ import ( fedcommon "sigs.k8s.io/kubefed/pkg/apis/core/common" fedv1b1 "sigs.k8s.io/kubefed/pkg/apis/core/v1beta1" 
"sigs.k8s.io/kubefed/pkg/client/generic" + "sigs.k8s.io/kubefed/pkg/metrics" ) const ( @@ -324,6 +325,8 @@ func (f *federatedInformerImpl) GetClientForCluster(clusterName string) (generic f.Lock() defer f.Unlock() + clientConnectionStart := time.Now() + // return cached client if one exists (to prevent frequent secret retrieval and rest discovery) if client, ok := f.clusterClients[clusterName]; ok { return client, nil @@ -337,6 +340,8 @@ func (f *federatedInformerImpl) GetClientForCluster(clusterName string) (generic return client, err } f.clusterClients[clusterName] = client + + metrics.UpdateDurationFromStart(metrics.ClusterClientConnection, clientConnectionStart) return client, nil } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 0000000000..0dbd594980 --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,118 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/klog" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + kubefedClusterNotReadyCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "kubefedcluster_not_ready_total", + Help: "Number of total not ready states of a kubefed cluster.", + }, []string{"cluster"} + ) + + kubefedClusterReadyCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "kubefedcluster_ready_total", + Help: "Number of total ready states of a kubefed cluster.", + }, []string{"cluster"} + ) + + kubefedClusterOfflineCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "kubefedcluster_offline_total", + Help: "Number of total offline states of a kubefed cluster.", + }, []string{"cluster"} + ) + + functionDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "function_duration_seconds", + Help: "Time taken by various parts of Kubefed main loops.", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, + }, []string{"function"}, + ) + + functionDurationSummary = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Name: "function_duration_quantile_seconds", + Help: "Quantiles of time taken by various parts of Kubefed main loops.", + MaxAge: time.Hour, + }, []string{"function"}, + ) +) + +// FunctionLabel is a name of Kubefed operation for which +// we measure duration +type FunctionLabel string + +const ( + // LogLongDurationThreshold defines the duration after which long function + // duration will be logged. 
+ LogLongDurationThreshold = 5 * time.Second +) + +// Names of Kubefed operations +const ( + ClusterHealthStatus FunctionLabel = "clusterHealthStatus" + ReconcileFederatedResources FunctionLabel = "reconcile:federatedResources" + ClusterClientConnection FunctionLabel = "clusterClientConnection" +) + +// RegisterAll registers all metrics. +func RegisterAll() { + metrics.Registry.MustRegister(kubefedClustersNotReadyCount, kubefedClustersReadyCount, kubefedClustersOfflineCount, functionDuration, functionDurationSummary) +} + +// UpdateDurationFromStart records the duration of the step identified by the +// label using start time +func UpdateDurationFromStart(label FunctionLabel, start time.Time) { + duration := time.Since(start) + UpdateDuration(label, duration) +} + +// RegisterKubefedClusterReadyCount records number of Ready states of a Kubefed cluster +func RegisterKubefedClusterReadyCount(clusterName string) { + kubefedClusterReadyCount.WithLabelValues(clusterName).Inc() +} + +// RegisterKubefedClusterOfflineCount records number of Offline states of a Kubefed cluster +func RegisterKubefedClusterOfflineCount(clusterName string) { + kubefedClusterOfflineCount.WithLabelValues(clusterName).Inc() +} + +// RegisterKubefedClusterReadyCount records number of NOT Ready states of a Kubefed cluster +func RegisterKubefedClusterNotReadyCount(clusterName string) { + kubefedClusterNotReadyCount.WithLabelValues(clusterName).Inc() +} + +// UpdateDuration records the duration of the step identified by the label +func UpdateDuration(label FunctionLabel, duration time.Duration) { + if duration > LogLongDurationThreshold { + klog.V(4).Infof("Function %s took %v to complete", label, duration) + } + + functionDurationSummary.WithLabelValues(string(label)).Observe(duration.Seconds()) + functionDuration.WithLabelValues(string(label)).Observe(duration.Seconds()) +} From 0bfbbf9737b664e4b589864553961746d6d5d2ff Mon Sep 17 00:00:00 2001 From: Hector Fernandez Date: Tue, 10 Mar 2020 00:46:01 
+0100 Subject: [PATCH 2/4] chore: add more metrics --- .../kubefedcluster/clusterclient.go | 6 +- pkg/controller/kubefedcluster/controller.go | 2 +- pkg/controller/sync/controller.go | 2 +- pkg/controller/sync/dispatch/managed.go | 6 +- pkg/controller/sync/dispatch/unmanaged.go | 4 + pkg/controller/util/federated_informer.go | 2 +- pkg/kubefedctl/join.go | 4 + pkg/kubefedctl/unjoin.go | 11 +- pkg/metrics/metrics.go | 204 +++++++++++++----- 9 files changed, 178 insertions(+), 63 deletions(-) diff --git a/pkg/controller/kubefedcluster/clusterclient.go b/pkg/controller/kubefedcluster/clusterclient.go index 241379ea5c..9272c67e47 100644 --- a/pkg/controller/kubefedcluster/clusterclient.go +++ b/pkg/controller/kubefedcluster/clusterclient.go @@ -129,13 +129,13 @@ func (self *ClusterClient) GetClusterHealthStatus() (*fedv1b1.KubeFedClusterStat if err != nil { runtime.HandleError(errors.Wrapf(err, "Failed to do cluster health check for cluster %q", self.clusterName)) clusterStatus.Conditions = append(clusterStatus.Conditions, newClusterOfflineCondition) - metrics.RegisterKubefedClusterOfflineCount(self.clusterName) + metrics.RegisterKubefedClusterTotal(metrics.ClusterOffline, self.clusterName) } else { if !strings.EqualFold(string(body), "ok") { - metrics.RegisterKubefedClusterNotReadyCount(self.clusterName) + metrics.RegisterKubefedClusterTotal(metrics.ClusterNotReady, self.clusterName) clusterStatus.Conditions = append(clusterStatus.Conditions, newClusterNotReadyCondition, newClusterNotOfflineCondition) } else { - metrics.RegisterKubefedClusterReadyCount(self.clusterName) + metrics.RegisterKubefedClusterTotal(metrics.ClusterReady, self.clusterName) clusterStatus.Conditions = append(clusterStatus.Conditions, newClusterReadyCondition) } } diff --git a/pkg/controller/kubefedcluster/controller.go b/pkg/controller/kubefedcluster/controller.go index ccdb1de48c..3e042e7066 100644 --- a/pkg/controller/kubefedcluster/controller.go +++ 
b/pkg/controller/kubefedcluster/controller.go @@ -261,7 +261,7 @@ func (cc *ClusterController) updateIndividualClusterStatus(cluster *fedv1b1.Kube klog.Warningf("Failed to update the status of cluster %q: %v", cluster.Name, err) } - metrics.UpdateDurationFromStart(metrics.ClusterHealthStatus, clusterHealthStatusStart) + metrics.ClusterHealthStatusDurationFromStart(clusterHealthStatusStart) wg.Done() } diff --git a/pkg/controller/sync/controller.go b/pkg/controller/sync/controller.go index 2360293a33..85cad7cffe 100644 --- a/pkg/controller/sync/controller.go +++ b/pkg/controller/sync/controller.go @@ -267,7 +267,7 @@ func (s *KubeFedSyncController) reconcile(qualifiedName util.QualifiedName) util startTime := time.Now() defer func() { klog.V(4).Infof("Finished reconciling %s %q (duration: %v)", kind, key, time.Since(startTime)) - metrics.UpdateDurationFromStart(metrics.ReconcileFederatedResources, startTime) + metrics.ReconcileFederatedResourcesDurationFromStart(startTime) }() if fedResource.Object().GetDeletionTimestamp() != nil { diff --git a/pkg/controller/sync/dispatch/managed.go b/pkg/controller/sync/dispatch/managed.go index 930440659f..24e9d6924d 100644 --- a/pkg/controller/sync/dispatch/managed.go +++ b/pkg/controller/sync/dispatch/managed.go @@ -21,6 +21,7 @@ import ( "fmt" "strings" "sync" + "time" "github.com/pkg/errors" @@ -33,6 +34,7 @@ import ( "sigs.k8s.io/kubefed/pkg/client/generic" "sigs.k8s.io/kubefed/pkg/controller/sync/status" "sigs.k8s.io/kubefed/pkg/controller/util" + "sigs.k8s.io/kubefed/pkg/metrics" ) // FederatedResourceForDispatch is the subset of the FederatedResource @@ -130,7 +132,7 @@ func (d *managedDispatcherImpl) Create(clusterName string) { // operation timed out. The timeout status will be cleared by // Wait() if a timeout does not occur. 
d.RecordStatus(clusterName, status.CreationTimedOut) - + start := time.Now() d.dispatcher.incrementOperationsInitiated() const op = "create" go d.dispatcher.clusterOperation(clusterName, op, func(client generic.Client) util.ReconciliationStatus { @@ -150,6 +152,7 @@ func (d *managedDispatcherImpl) Create(clusterName string) { if err == nil { version := util.ObjectVersion(obj) d.recordVersion(clusterName, version) + metrics.DispatchOperationDurationFromStart("create", start) return util.StatusAllOK } @@ -175,6 +178,7 @@ func (d *managedDispatcherImpl) Create(clusterName string) { d.recordError(clusterName, op, errors.Errorf("An update will be attempted instead of a creation due to an existing resource")) d.Update(clusterName, obj) + metrics.DispatchOperationDurationFromStart("update", start) return util.StatusAllOK }) } diff --git a/pkg/controller/sync/dispatch/unmanaged.go b/pkg/controller/sync/dispatch/unmanaged.go index 615680d41a..374ddf1c6f 100644 --- a/pkg/controller/sync/dispatch/unmanaged.go +++ b/pkg/controller/sync/dispatch/unmanaged.go @@ -18,6 +18,7 @@ package dispatch import ( "context" + "time" "github.com/pkg/errors" @@ -30,6 +31,7 @@ import ( "sigs.k8s.io/kubefed/pkg/client/generic" "sigs.k8s.io/kubefed/pkg/controller/sync/status" "sigs.k8s.io/kubefed/pkg/controller/util" + "sigs.k8s.io/kubefed/pkg/metrics" ) const eventTemplate = "%s %s %q in cluster %q" @@ -71,6 +73,7 @@ func (d *unmanagedDispatcherImpl) Wait() (bool, error) { } func (d *unmanagedDispatcherImpl) Delete(clusterName string) { + start := time.Now() d.dispatcher.incrementOperationsInitiated() const op = "delete" const opContinuous = "Deleting" @@ -97,6 +100,7 @@ func (d *unmanagedDispatcherImpl) Delete(clusterName string) { } return util.StatusError } + metrics.DispatchOperationDurationFromStart("delete", start) return util.StatusAllOK }) } diff --git a/pkg/controller/util/federated_informer.go b/pkg/controller/util/federated_informer.go index 649af10f4c..363ea9eb92 100644 --- 
a/pkg/controller/util/federated_informer.go +++ b/pkg/controller/util/federated_informer.go @@ -341,7 +341,7 @@ func (f *federatedInformerImpl) GetClientForCluster(clusterName string) (generic } f.clusterClients[clusterName] = client - metrics.UpdateDurationFromStart(metrics.ClusterClientConnection, clientConnectionStart) + metrics.ClusterClientConnectionDurationFromStart(clientConnectionStart) return client, nil } diff --git a/pkg/kubefedctl/join.go b/pkg/kubefedctl/join.go index 27d0e60e1f..af51246bc8 100644 --- a/pkg/kubefedctl/join.go +++ b/pkg/kubefedctl/join.go @@ -43,6 +43,7 @@ import ( ctlutil "sigs.k8s.io/kubefed/pkg/controller/util" "sigs.k8s.io/kubefed/pkg/kubefedctl/options" "sigs.k8s.io/kubefed/pkg/kubefedctl/util" + "sigs.k8s.io/kubefed/pkg/metrics" ) const ( @@ -218,6 +219,7 @@ func JoinCluster(hostConfig, clusterConfig *rest.Config, kubefedNamespace, func joinClusterForNamespace(hostConfig, clusterConfig *rest.Config, kubefedNamespace, joiningNamespace, hostClusterName, joiningClusterName, secretName string, scope apiextv1b1.ResourceScope, dryRun, errorOnExisting bool) (*fedv1b1.KubeFedCluster, error) { + start := time.Now() hostClientset, err := util.HostClientset(hostConfig) if err != nil { @@ -280,6 +282,8 @@ func joinClusterForNamespace(hostConfig, clusterConfig *rest.Config, kubefedName } klog.V(2).Info("Created federated cluster resource") + metrics.JoinedClusterTotalInc() + metrics.JoinedClusterDurationFromStart(start) return kubefedCluster, nil } diff --git a/pkg/kubefedctl/unjoin.go b/pkg/kubefedctl/unjoin.go index 7aa7a25353..577a920d94 100644 --- a/pkg/kubefedctl/unjoin.go +++ b/pkg/kubefedctl/unjoin.go @@ -21,6 +21,7 @@ import ( goerrors "errors" "io" "strings" + "time" "github.com/pkg/errors" "github.com/spf13/cobra" @@ -37,6 +38,7 @@ import ( controllerutil "sigs.k8s.io/kubefed/pkg/controller/util" "sigs.k8s.io/kubefed/pkg/kubefedctl/options" "sigs.k8s.io/kubefed/pkg/kubefedctl/util" + "sigs.k8s.io/kubefed/pkg/metrics" ) var ( @@ 
-172,6 +174,7 @@ func (j *unjoinFederation) Run(cmdOut io.Writer, config util.FedConfig) error { // required set of parameters are passed in. func UnjoinCluster(hostConfig, clusterConfig *rest.Config, kubefedNamespace, hostClusterName, unjoiningClusterContext, unjoiningClusterName string, forceDeletion, dryRun bool) error { + start := time.Now() hostClientset, err := util.HostClientset(hostConfig) if err != nil { @@ -215,7 +218,13 @@ func UnjoinCluster(hostConfig, clusterConfig *rest.Config, kubefedNamespace, hos } // deletionSucceeded when all operations in deleteRBACResources and deleteFedNSFromUnjoinCluster succeed. - return deleteFederatedClusterAndSecret(hostClientset, client, kubefedNamespace, unjoiningClusterName, forceDeletion, dryRun) + err = deleteFederatedClusterAndSecret(hostClientset, client, kubefedNamespace, unjoiningClusterName, forceDeletion, dryRun) + if err != nil { + return err + } + metrics.JoinedClusterTotalDec() + metrics.UnjoinedClusterDurationFromStart(start) + return nil } // deleteKubeFedClusterAndSecret deletes a federated cluster resource that associates diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 0dbd594980..340f475b1b 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -25,94 +25,188 @@ import ( ) var ( - kubefedClusterNotReadyCount = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "kubefedcluster_not_ready_total", - Help: "Number of total not ready states of a kubefed cluster.", - }, []string{"cluster"} + kubefedClusterTotal = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kubefedcluster_total", + Help: "Number of total kubefed cluster in a specific state.", + }, []string{"state", "cluster"}, ) - kubefedClusterReadyCount = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "kubefedcluster_ready_total", - Help: "Number of total ready states of a kubefed cluster.", - }, []string{"cluster"} + joinedClusterTotal = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: 
"joinedcluster_total", + Help: "Number of total joined clusters.", + }, ) - kubefedClusterOfflineCount = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "kubefedcluster_offline_total", - Help: "Number of total offline states of a kubefed cluster.", - }, []string{"cluster"} + clusterHealthStatusDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "cluster_health_status_duration_seconds", + Help: "Time taken for the cluster health periodic function.", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, + }, + ) + + clusterClientConnectionDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "cluster_client_connection_duration_seconds", + Help: "Time taken for the cluster client connection function.", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, + }, ) - functionDuration = prometheus.NewHistogramVec( + reconcileFederatedResourcesDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ - Name: "function_duration_seconds", - Help: "Time taken by various parts of Kubefed main loops.", + Name: "reconcile_federated_resources_duration_seconds", + Help: "Time taken to reconcile federated resources in the target clusters.", Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, - }, []string{"function"}, + }, ) - functionDurationSummary = prometheus.NewSummaryVec( + joinedClusterDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "join_cluster_duration_seconds", + Help: "Time taken to join a cluster.", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, + }, + ) + + unjoinedClusterDuration = prometheus.NewHistogram( + 
prometheus.HistogramOpts{ + Name: "unjoin_cluster_duration_seconds", + Help: "Time taken to unjoin a cluster.", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, + }, + ) + + dispatchOperationDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "dispatch_operation_duration_seconds", + Help: "Time taken to run dispatch operation.", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, + }, []string{"action"}, + ) + + controllerRuntimeReconcileDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "controller_runtime_reconcile_duration_seconds", + Help: "Time taken by various parts of Kubefed controllers reconciliation loops.", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, + }, []string{"controller"}, + ) + + controllerRuntimeReconcileDurationSummary = prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "function_duration_quantile_seconds", - Help: "Quantiles of time taken by various parts of Kubefed main loops.", + Name: "controller_runtime_reconcile_quantile_seconds", + Help: "Quantiles of time taken by various parts of Kubefed controllers reconciliation loops.", MaxAge: time.Hour, - }, []string{"function"}, + }, []string{"controller"}, ) ) -// FunctionLabel is a name of Kubefed operation for which -// we measure duration -type FunctionLabel string - const ( - // LogLongDurationThreshold defines the duration after which long function + // LogReconcileLongDurationThreshold defines the duration after which long function // duration will be logged. 
- LogLongDurationThreshold = 5 * time.Second -) + LogReconcileLongDurationThreshold = 10 * time.Second -// Names of Kubefed operations -const ( - ClusterHealthStatus FunctionLabel = "clusterHealthStatus" - ReconcileFederatedResources FunctionLabel = "reconcile:federatedResources" - ClusterClientConnection FunctionLabel = "clusterClientConnection" + ClusterNotReady = "notready" + ClusterReady = "ready" + ClusterOffline = "offline" ) // RegisterAll registers all metrics. func RegisterAll() { - metrics.Registry.MustRegister(kubefedClustersNotReadyCount, kubefedClustersReadyCount, kubefedClustersOfflineCount, functionDuration, functionDurationSummary) + metrics.Registry.MustRegister( + kubefedClusterTotal, + joinedClusterTotal, + reconcileFederatedResourcesDuration, + clusterHealthStatusDuration, + clusterClientConnectionDuration, + joinedClusterDuration, + unjoinedClusterDuration, + dispatchOperationDuration, + controllerRuntimeReconcileDuration, + controllerRuntimeReconcileDurationSummary, + ) +} + +// RegisterKubefedClusterTotal records number of kubefed clusters in a specific state +func RegisterKubefedClusterTotal(state, cluster string) { + switch state { + case ClusterReady: + kubefedClusterTotal.WithLabelValues(state, cluster).Set(1) + kubefedClusterTotal.WithLabelValues(ClusterNotReady, cluster).Set(0) + kubefedClusterTotal.WithLabelValues(ClusterOffline, cluster).Set(0) + case ClusterNotReady: + kubefedClusterTotal.WithLabelValues(state, cluster).Set(1) + kubefedClusterTotal.WithLabelValues(ClusterOffline, cluster).Set(0) + kubefedClusterTotal.WithLabelValues(ClusterReady, cluster).Set(0) + case ClusterOffline: + kubefedClusterTotal.WithLabelValues(state, cluster).Set(1) + kubefedClusterTotal.WithLabelValues(ClusterNotReady, cluster).Set(0) + kubefedClusterTotal.WithLabelValues(ClusterReady, cluster).Set(0) + } +} + +// JoinedClusterTotalInc increases by one the number of joined kubefed clusters +func JoinedClusterTotalInc() { + joinedClusterTotal.Inc() +} + 
+// JoinedClusterTotalDec decreases by one the number of joined kubefed clusters +func JoinedClusterTotalDec() { + joinedClusterTotal.Dec() +} + +// DispatchOperationDurationFromStart records the duration of the step identified by the action name +func DispatchOperationDurationFromStart(action string, start time.Time) { + duration := time.Since(start) + dispatchOperationDuration.WithLabelValues(action).Observe(duration.Seconds()) } -// UpdateDurationFromStart records the duration of the step identified by the -// label using start time -func UpdateDurationFromStart(label FunctionLabel, start time.Time) { +// ClusterHealthStatusDurationFromStart records the duration of the cluster health status operation +func ClusterHealthStatusDurationFromStart(start time.Time) { duration := time.Since(start) - UpdateDuration(label, duration) + clusterHealthStatusDuration.Observe(duration.Seconds()) } -// RegisterKubefedClusterReadyCount records number of Ready states of a Kubefed cluster -func RegisterKubefedClusterReadyCount(clusterName string) { - kubefedClusterReadyCount.WithLabelValues(clusterName).Inc() +// ClusterClientConnectionDurationFromStart records the duration of the cluster client connection operation +func ClusterClientConnectionDurationFromStart(start time.Time) { + duration := time.Since(start) + clusterClientConnectionDuration.Observe(duration.Seconds()) } -// RegisterKubefedClusterOfflineCount records number of Offline states of a Kubefed cluster -func RegisterKubefedClusterOfflineCount(clusterName string) { - kubefedClusterOfflineCount.WithLabelValues(clusterName).Inc() +// JoinedClusterDurationFromStart records the duration of the cluster joined operation +func JoinedClusterDurationFromStart(start time.Time) { + duration := time.Since(start) + joinedClusterDuration.Observe(duration.Seconds()) } -// RegisterKubefedClusterReadyCount records number of NOT Ready states of a Kubefed cluster -func RegisterKubefedClusterNotReadyCount(clusterName string) { - 
kubefedClusterNotReadyCount.WithLabelValues(clusterName).Inc() +// UnjoinedClusterDurationFromStart records the duration of the cluster unjoined operation +func UnjoinedClusterDurationFromStart(start time.Time) { + duration := time.Since(start) + unjoinedClusterDuration.Observe(duration.Seconds()) +} + +// ReconcileFederatedResourcesDurationFromStart records the duration of the federation of resources +func ReconcileFederatedResourcesDurationFromStart(start time.Time) { + duration := time.Since(start) + reconcileFederatedResourcesDuration.Observe(duration.Seconds()) +} + +// UpdateControllerReconcileDurationFromStart records the duration of the reconcile loop +// of a controller +func UpdateControllerReconcileDurationFromStart(controller string, start time.Time) { + duration := time.Since(start) + UpdateControllerReconcileDuration(controller, duration) } -// UpdateDuration records the duration of the step identified by the label -func UpdateDuration(label FunctionLabel, duration time.Duration) { - if duration > LogLongDurationThreshold { - klog.V(4).Infof("Function %s took %v to complete", label, duration) +// UpdateControllerReconcileDuration records the duration of the reconcile function of a controller +func UpdateControllerReconcileDuration(controller string, duration time.Duration) { + if duration > LogReconcileLongDurationThreshold { + klog.V(4).Infof("Reconcile loop %s took %v to complete", controller, duration) } - functionDurationSummary.WithLabelValues(string(label)).Observe(duration.Seconds()) - functionDuration.WithLabelValues(string(label)).Observe(duration.Seconds()) + controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds()) + controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds()) } From 786aa4b915eb54c43619d67174d55260ab48979e Mon Sep 17 00:00:00 2001 From: Hector Fernandez Date: Tue, 10 Mar 2020 14:47:30 +0100 Subject: [PATCH 3/4] docs: add kubefed-custom-metrics kep --- 
docs/keps/20200302-kubefed-metrics.md | 148 ++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 docs/keps/20200302-kubefed-metrics.md diff --git a/docs/keps/20200302-kubefed-metrics.md b/docs/keps/20200302-kubefed-metrics.md new file mode 100644 index 0000000000..619ac558bd --- /dev/null +++ b/docs/keps/20200302-kubefed-metrics.md @@ -0,0 +1,148 @@ +--- +kep-number: 0 +short-desc: Kubefed Custom Metrics +title: Kubefed Custom Metrics +authors: + - "@hectorj2f" +reviewers: + - "@jimmidyson" + - "@pmorie" + - "@xunpan" +approvers: + - "@jimmidyson" + - "@pmorie" + - "@xunpan" +editor: TBD +creation-date: 2020-03-02 +last-updated: 2020-03-02 +status: provisional +--- + +# Kubefed Custom Metrics + +## Table of Contents + +* [Kubefed Custom Metrics](#kubefed-custom-metrics) + * [Table of Contents](#table-of-contents) + * [Summary](#summary) + * [Motivation](#motivation) + * [Goals](#goals) + * [Non\-Goals](#non-goals) + * [Proposals](#proposals) + * [Metrics](#metrics) + * [Risks and Mitigations](#risks-and-mitigations) + * [Graduation Criteria](#graduation-criteria) + * [Implementation History](#implementation-history) + * [Drawbacks](#drawbacks) + * [Infrastructure Needed](#infrastructure-needed) + +## Summary + +This document describes the different metrics and valuable data that could be exposed +and consumed from Kubefed to create dashboards and better understand this engine. + +## Motivation + +We aim to define a generic strategy on how to identify, consume and expose +metrics from our core components in Kommander. + + +### Goals + +* Identify which metrics should be exposed from Kubefed if possible. +* Define a set of Kubefed metrics that could be consumed by Prometheus tools. +* Specify the type of each metric (e.g. histogram, gauge, counter, summary). +* Use these metrics to create Grafana dashboards. + +### Non-Goals + +* Technical details about the Grafana dashboards. + +## Proposals + +Kubefed already exposes a small set of metrics.
These are some of the default metrics provided by +the [controller-runtime](https://github.com/kubernetes-sigs/controller-runtime/tree/master/pkg/metrics), in particular, Kubefed only exposes the client-only metrics. The rest of the metrics are not available because Kubefed was not implemented +using the `controller-runtime` utils. + +The metrics +are exposed by a `/metrics` route in a [Prometheus-friendly format](https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md). +A service monitor should be created to instruct Prometheus tools to scrape +the metrics from the Kubefed `metrics` service endpoint. + +However, the client-only metrics are not enough, and Kubefed custom metrics have to +be identified and exposed to better understand this engine and its scalability challenges. + + +### Metrics + +In the following, we list the relevant metrics: + +The state of a Kubefed cluster reflects the status of the cluster and is periodically checked. + + +The following metric aims to register the total number of Kubefed clusters in the `ready`, `notready` and `offline` states: + +* `kubefedcluster_total`: a gauge metric that holds the number of Kubefed clusters in any of the three possible states. + To identify the type of state, we add a label `state` to this metric with the value of the state. + +In addition to these metrics, we should also store the time this whole operation takes: + +* `cluster_health_status_duration_seconds`: this `histogram` metric holds the duration in seconds of the action that checks +the health status of a Kubefed cluster. + +Kubefed needs to connect to the remote clusters to validate/create/delete all the federated resources +in the target clusters. When there are many clusters, the time spent connecting +to remote clusters might be relevant: + +* `cluster_client_connection_duration_seconds`: this `histogram` metric holds the duration in seconds of the creation +of a Kubernetes client to a remote cluster.
This operation normally involves connecting to +the remote server to get certain metadata. + +Kubefed federates resources on target clusters, and one of its controllers triggers +a periodic reconciliation of all target federated resources. + +* `reconcile_federated_resources_duration_seconds`: this `histogram` metric holds the duration in seconds of the action that +reconciles federated resources in the target clusters. + +Another operation that is relevant to record is the creation/update/deletion of +the propagated resources. This action is handled by the so-called dispatchers in Kubefed. + +For this, we could use a single metric that includes additional labels +to distinguish the different operations: + +* `dispatch_operation_duration_seconds`: this `histogram` metric holds the duration in seconds of the creation/update/deletion +of the different propagated resources. The label `action` will hold the `create`, `update` and `delete` operations. + +Regarding cluster join/unjoin operations, these metrics are also convenient to register: + +* `joinedcluster_total`: a gauge metric that holds the number joined clusters. + +* `join_cluster_duration_seconds`: this `histogram` metric holds the duration in seconds of the join cluster action. + +* `unjoin_cluster_duration_seconds`: this `histogram` metric holds the duration in seconds of the unjoin cluster action. + +To keep track of the rest of the controllers and their reconciliation times, we will use a generic metric: + +* `controller_runtime_reconcile_duration_seconds`: a `histogram` metric that keeps track of the duration +of reconciliations for other Kubefed controllers. A label `controller` makes it possible to distinguish +the different controllers. + +In addition to these metrics, we could add counters to register common error types. +This approach would make it easy to track their rate on a dashboard. 
+ + +#### Alternatives + +### Implementation Details/Notes/Constraints + +All the identified metrics in this document might be added to Kubefed in an incremental manner. + +### Risks and Mitigations + +## Graduation Criteria + +## Implementation History + +## Drawbacks + +## Infrastructure Needed From e17725d3d8a3cdb16f44a1ad8040836f08951b50 Mon Sep 17 00:00:00 2001 From: Hector Fernandez Date: Sun, 15 Mar 2020 16:42:50 +0100 Subject: [PATCH 4/4] metrics: register times for the reconcile loops --- docs/keps/20200302-kubefed-metrics.md | 4 ++-- pkg/controller/federatedtypeconfig/controller.go | 3 +++ pkg/controller/ingressdns/controller.go | 3 +++ pkg/controller/kubefedcluster/controller.go | 5 ++--- pkg/controller/schedulingmanager/controller.go | 5 +++++ pkg/controller/schedulingpreference/controller.go | 3 +++ pkg/controller/servicedns/controller.go | 3 +++ pkg/controller/status/controller.go | 3 +++ pkg/controller/util/federated_informer.go | 4 +--- pkg/metrics/metrics.go | 2 +- 10 files changed, 26 insertions(+), 9 deletions(-) diff --git a/docs/keps/20200302-kubefed-metrics.md b/docs/keps/20200302-kubefed-metrics.md index 619ac558bd..4fca39d7f0 100644 --- a/docs/keps/20200302-kubefed-metrics.md +++ b/docs/keps/20200302-kubefed-metrics.md @@ -44,7 +44,7 @@ and consumed from Kubefed to create dashboards and better understand this engine ## Motivation We aim to define a generic strategy on how to identify, consume and expose -metrics from our core components in Kommander. +custom Kubefed metrics. ### Goals @@ -115,7 +115,7 @@ of the different propagated resources. The label `action` will hold the `create` Regarding cluster join/unjoin operations, these metrics are also convenient to register: -* `joinedcluster_total`: a gauge metric that holds the number joined clusters. +* `joined_cluster_total`: a gauge metric that holds the number joined clusters. 
* `join_cluster_duration_seconds`: this `histogram` metric holds the duration in seconds of the join cluster action. diff --git a/pkg/controller/federatedtypeconfig/controller.go b/pkg/controller/federatedtypeconfig/controller.go index aad96c89a8..37e39aa9bf 100644 --- a/pkg/controller/federatedtypeconfig/controller.go +++ b/pkg/controller/federatedtypeconfig/controller.go @@ -19,6 +19,7 @@ package federatedtypeconfig import ( "context" "sync" + "time" "github.com/pkg/errors" @@ -36,6 +37,7 @@ import ( statuscontroller "sigs.k8s.io/kubefed/pkg/controller/status" synccontroller "sigs.k8s.io/kubefed/pkg/controller/sync" "sigs.k8s.io/kubefed/pkg/controller/util" + "sigs.k8s.io/kubefed/pkg/metrics" ) const finalizer string = "core.kubefed.io/federated-type-config" @@ -128,6 +130,7 @@ func (c *Controller) Run(stopChan <-chan struct{}) { func (c *Controller) reconcile(qualifiedName util.QualifiedName) util.ReconciliationStatus { key := qualifiedName.String() + defer metrics.UpdateControllerReconcileDurationFromStart("federatedtypeconfigcontroller", time.Now()) klog.V(3).Infof("Running reconcile FederatedTypeConfig for %q", key) diff --git a/pkg/controller/ingressdns/controller.go b/pkg/controller/ingressdns/controller.go index 8f8e9658f4..930300189f 100644 --- a/pkg/controller/ingressdns/controller.go +++ b/pkg/controller/ingressdns/controller.go @@ -38,6 +38,7 @@ import ( dnsv1a1 "sigs.k8s.io/kubefed/pkg/apis/multiclusterdns/v1alpha1" genericclient "sigs.k8s.io/kubefed/pkg/client/generic" "sigs.k8s.io/kubefed/pkg/controller/util" + "sigs.k8s.io/kubefed/pkg/metrics" ) const ( @@ -201,6 +202,8 @@ func (c *Controller) reconcileOnClusterChange() { } func (c *Controller) reconcile(qualifiedName util.QualifiedName) util.ReconciliationStatus { + defer metrics.UpdateControllerReconcileDurationFromStart("ingressdnscontroller", time.Now()) + if !c.isSynced() { return util.StatusNotSynced } diff --git a/pkg/controller/kubefedcluster/controller.go 
b/pkg/controller/kubefedcluster/controller.go index 3e042e7066..e970c3a39d 100644 --- a/pkg/controller/kubefedcluster/controller.go +++ b/pkg/controller/kubefedcluster/controller.go @@ -241,9 +241,10 @@ func (cc *ClusterController) updateClusterStatus() error { func (cc *ClusterController) updateIndividualClusterStatus(cluster *fedv1b1.KubeFedCluster, storedData *ClusterData, wg *sync.WaitGroup) { + defer metrics.ClusterHealthStatusDurationFromStart(time.Now()) + clusterClient := storedData.clusterKubeClient - clusterHealthStatusStart := time.Now() currentClusterStatus, err := clusterClient.GetClusterHealthStatus() if err != nil { cc.RecordError(cluster, "RetrievingClusterHealthFailed", errors.Wrap(err, "Failed to retrieve health of the cluster")) @@ -261,8 +262,6 @@ func (cc *ClusterController) updateIndividualClusterStatus(cluster *fedv1b1.Kube klog.Warningf("Failed to update the status of cluster %q: %v", cluster.Name, err) } - metrics.ClusterHealthStatusDurationFromStart(clusterHealthStatusStart) - wg.Done() } diff --git a/pkg/controller/schedulingmanager/controller.go b/pkg/controller/schedulingmanager/controller.go index 12373fa6dd..e529281e50 100644 --- a/pkg/controller/schedulingmanager/controller.go +++ b/pkg/controller/schedulingmanager/controller.go @@ -17,6 +17,8 @@ limitations under the License. 
package schedulingmanager import ( + "time" + "github.com/pkg/errors" "k8s.io/apimachinery/pkg/util/runtime" @@ -27,6 +29,7 @@ import ( corev1b1 "sigs.k8s.io/kubefed/pkg/apis/core/v1beta1" "sigs.k8s.io/kubefed/pkg/controller/schedulingpreference" "sigs.k8s.io/kubefed/pkg/controller/util" + "sigs.k8s.io/kubefed/pkg/metrics" "sigs.k8s.io/kubefed/pkg/schedulingtypes" ) @@ -141,6 +144,8 @@ func (c *SchedulingManager) shutdown() { } func (c *SchedulingManager) reconcile(qualifiedName util.QualifiedName) util.ReconciliationStatus { + defer metrics.UpdateControllerReconcileDurationFromStart("schedulingmanagercontroller", time.Now()) + key := qualifiedName.String() klog.V(3).Infof("Running reconcile FederatedTypeConfig %q in scheduling manager", key) diff --git a/pkg/controller/schedulingpreference/controller.go b/pkg/controller/schedulingpreference/controller.go index 2a9c0a0b03..820870b257 100644 --- a/pkg/controller/schedulingpreference/controller.go +++ b/pkg/controller/schedulingpreference/controller.go @@ -35,6 +35,7 @@ import ( fedv1b1 "sigs.k8s.io/kubefed/pkg/apis/core/v1beta1" "sigs.k8s.io/kubefed/pkg/controller/util" + "sigs.k8s.io/kubefed/pkg/metrics" "sigs.k8s.io/kubefed/pkg/schedulingtypes" ) @@ -191,6 +192,8 @@ func (s *SchedulingPreferenceController) reconcileOnClusterChange() { } func (s *SchedulingPreferenceController) reconcile(qualifiedName util.QualifiedName) util.ReconciliationStatus { + defer metrics.UpdateControllerReconcileDurationFromStart("schedulingpreferencecontroller", time.Now()) + if !s.isSynced() { return util.StatusNotSynced } diff --git a/pkg/controller/servicedns/controller.go b/pkg/controller/servicedns/controller.go index 019ea6cdb2..ac77f8a88d 100644 --- a/pkg/controller/servicedns/controller.go +++ b/pkg/controller/servicedns/controller.go @@ -37,6 +37,7 @@ import ( dnsv1a1 "sigs.k8s.io/kubefed/pkg/apis/multiclusterdns/v1alpha1" genericclient "sigs.k8s.io/kubefed/pkg/client/generic" "sigs.k8s.io/kubefed/pkg/controller/util" + 
"sigs.k8s.io/kubefed/pkg/metrics" ) const ( @@ -257,6 +258,8 @@ func (c *Controller) reconcileOnClusterChange() { } func (c *Controller) reconcile(qualifiedName util.QualifiedName) util.ReconciliationStatus { + defer metrics.UpdateControllerReconcileDurationFromStart("servicednscontroller", time.Now()) + if !c.isSynced() { return util.StatusNotSynced } diff --git a/pkg/controller/status/controller.go b/pkg/controller/status/controller.go index 6176dd138b..f8bd25c543 100644 --- a/pkg/controller/status/controller.go +++ b/pkg/controller/status/controller.go @@ -37,6 +37,7 @@ import ( fedv1b1 "sigs.k8s.io/kubefed/pkg/apis/core/v1beta1" genericclient "sigs.k8s.io/kubefed/pkg/client/generic" "sigs.k8s.io/kubefed/pkg/controller/util" + "sigs.k8s.io/kubefed/pkg/metrics" ) const ( @@ -230,6 +231,8 @@ func (s *KubeFedStatusController) reconcileOnClusterChange() { } func (s *KubeFedStatusController) reconcile(qualifiedName util.QualifiedName) util.ReconciliationStatus { + defer metrics.UpdateControllerReconcileDurationFromStart("statuscontroller", time.Now()) + if !s.isSynced() { return util.StatusNotSynced } diff --git a/pkg/controller/util/federated_informer.go b/pkg/controller/util/federated_informer.go index 363ea9eb92..b6c713e4dc 100644 --- a/pkg/controller/util/federated_informer.go +++ b/pkg/controller/util/federated_informer.go @@ -322,11 +322,10 @@ func (f *federatedInformerImpl) Start() { // GetClientForCluster returns a client for the cluster, if present. 
func (f *federatedInformerImpl) GetClientForCluster(clusterName string) (generic.Client, error) { + defer metrics.ClusterClientConnectionDurationFromStart(time.Now()) f.Lock() defer f.Unlock() - clientConnectionStart := time.Now() - // return cached client if one exists (to prevent frequent secret retrieval and rest discovery) if client, ok := f.clusterClients[clusterName]; ok { return client, nil @@ -341,7 +340,6 @@ func (f *federatedInformerImpl) GetClientForCluster(clusterName string) (generic } f.clusterClients[clusterName] = client - metrics.ClusterClientConnectionDurationFromStart(clientConnectionStart) return client, nil } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 340f475b1b..3af97d6bfd 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -34,7 +34,7 @@ var ( joinedClusterTotal = prometheus.NewGauge( prometheus.GaugeOpts{ - Name: "joinedcluster_total", + Name: "joined_cluster_total", Help: "Number of total joined clusters.", }, )