From e98031a6111fd191b2d9d8f34ecceedfcd1a8906 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Mon, 24 Jun 2024 09:39:58 -0700 Subject: [PATCH] Enable additional flyte components prometheus metrics (#324) Some flyte components were lacking metrics. clustersyncresources is especially necessary now that new projects are created frequently in serverless. But we should be monitoring all internal services to some extent. * Update clusterresourcesync to export metrics via httpendpoint * Update clusterresourcesync deployment with Prometheus scrape annotations, disabled by default * Update propeller deployment with Prometheus scrape annotations, disabled by default * Update webhook deployment with Prometheus scrape annotations, disabled by default * Create Webhook HPA, default disabled. Changes are intended to be forward-compatible, specifically with Prometheus Pod and ServiceMonitor. towards PE-912 towards PE-846 --- charts/flyte-core/README.md | 19 ++++++++++- charts/flyte-core/templates/_helpers.tpl | 1 - .../clusterresourcesync/deployment.yaml | 11 ++++++ .../templates/propeller/deployment.yaml | 10 ++++++ .../templates/propeller/webhook-hpa.yaml | 17 ++++++++++ .../templates/propeller/webhook.yaml | 10 ++++++ charts/flyte-core/values.yaml | 34 +++++++++++++++++++ .../flyte_aws_scheduler_helm_generated.yaml | 6 ++++ .../flyte_helm_controlplane_generated.yaml | 2 ++ .../eks/flyte_helm_dataplane_generated.yaml | 4 +++ deployment/eks/flyte_helm_generated.yaml | 6 ++++ .../flyte_helm_controlplane_generated.yaml | 2 ++ .../gcp/flyte_helm_dataplane_generated.yaml | 4 +++ deployment/gcp/flyte_helm_generated.yaml | 6 ++++ deployment/sandbox/flyte_helm_generated.yaml | 6 ++++ .../manifests/complete-agent.yaml | 4 +-- .../sandbox-bundled/manifests/complete.yaml | 4 +-- docker/sandbox-bundled/manifests/dev.yaml | 4 +-- flyteadmin/cmd/entrypoints/clusterresource.go | 12 +++++++ flyteadmin/pkg/clusterresource/controller.go | 5 ++- 20 files changed, 158 insertions(+), 9 deletions(-) create mode 100644 charts/flyte-core/templates/propeller/webhook-hpa.yaml diff --git a/charts/flyte-core/README.md b/charts/flyte-core/README.md index 4662eeef66a..41ce2af6edc 100644 --- a/charts/flyte-core/README.md +++ b/charts/flyte-core/README.md @@ -60,7 +60,7 @@ helm install gateway bitnami/contour -n flyte | cloud_events.eventsPublisher.eventTypes[0] | string | `"all"` | | | cloud_events.eventsPublisher.topicName | string | `"arn:aws:sns:us-east-2:123456:123-my-topic"` | | | cloud_events.type | string | `"aws"` | | -| cluster_resource_manager | object | `{"config":{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}},"enabled":true,"nodeSelector":{},"podAnnotations":{},"podEnv":{},"podLabels":{},"resources":{},"service_account_name":"flyteadmin","standaloneDeployment":false,"templates":[{"key":"aa_namespace","value":"apiVersion: v1\nkind: Namespace\nmetadata:\n name: {{ namespace }}\nspec:\n finalizers:\n - kubernetes\n"},{"key":"ab_project_resource_quota","value":"apiVersion: v1\nkind: ResourceQuota\nmetadata:\n name: project-quota\n namespace: {{ namespace }}\nspec:\n hard:\n limits.cpu: {{ projectQuotaCpu }}\n limits.memory: {{ projectQuotaMemory }}\n"}]}` | Configuration for the Cluster resource manager component. This is an optional component, that enables automatic cluster configuration. This is useful to set default quotas, manage namespaces etc that map to a project/domain | +| cluster_resource_manager | object | `{"config":{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}},"enabled":true,"nodeSelector":{},"podAnnotations":{},"podEnv":{},"podLabels":{},"prometheus":{"enabled":false,"path":"/metrics","port":10254},"resources":{},"service_account_name":"flyteadmin","standaloneDeployment":false,"templates":[{"key":"aa_namespace","value":"apiVersion: v1\nkind: Namespace\nmetadata:\n name: {{ namespace }}\nspec:\n finalizers:\n - kubernetes\n"},{"key":"ab_project_resource_quota","value":"apiVersion: v1\nkind: ResourceQuota\nmetadata:\n name: project-quota\n namespace: {{ namespace }}\nspec:\n hard:\n limits.cpu: {{ projectQuotaCpu }}\n limits.memory: {{ projectQuotaMemory }}\n"}]}` | Configuration for the Cluster resource manager component. This is an optional component, that enables automatic cluster configuration. This is useful to set default quotas, manage namespaces etc that map to a project/domain | | cluster_resource_manager.config | object | `{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}}` | Configmap for ClusterResource parameters | | cluster_resource_manager.config.cluster_resources | object | `{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}` | ClusterResource parameters Refer to the [structure](https://pkg.go.dev/github.com/lyft/flyteadmin@v0.3.37/pkg/runtime/interfaces#ClusterResourceConfig) to customize. | | cluster_resource_manager.config.cluster_resources.refreshInterval | string | `"5m"` | How frequently to run the sync process | @@ -238,6 +238,9 @@ helm install gateway bitnami/contour -n flyte | flytepropeller.podEnv | object | `{}` | Additional Flytepropeller container environment variables | | flytepropeller.podLabels | object | `{}` | Labels for Flytepropeller pods | | flytepropeller.priorityClassName | string | `""` | Sets priorityClassName for propeller pod(s). | +| flytepropeller.prometheus.enabled | bool | `false` | | +| flytepropeller.prometheus.path | string | `"/metrics"` | | +| flytepropeller.prometheus.port | int | `10254` | | | flytepropeller.replicaCount | int | `1` | Replicas count for Flytepropeller deployment | | flytepropeller.resources | object | `{"limits":{"cpu":"200m","ephemeral-storage":"100Mi","memory":"200Mi"},"requests":{"cpu":"10m","ephemeral-storage":"50Mi","memory":"100Mi"}}` | Default resources requests and limits for Flytepropeller deployment | | flytepropeller.securityContext | object | `{"fsGroup":65534,"fsGroupChangePolicy":"Always","runAsUser":1001}` | Sets securityContext for flytepropeller pod(s). | @@ -295,8 +298,22 @@ helm install gateway bitnami/contour -n flyte | storage.s3.authType | string | `"iam"` | type of authentication to use for S3 buckets, can either be iam or accesskey | | storage.s3.secretKey | string | `""` | AWS IAM user secret access key to use for S3 bucket auth, only used if authType is set to accesskey | | storage.type | string | `"sandbox"` | Sets the storage type. Supported values are sandbox, s3, gcs and custom. | +| webhook.autoscaling.enabled | bool | `false` | | +| webhook.autoscaling.maxReplicas | int | `10` | | +| webhook.autoscaling.metrics[0].resource.name | string | `"cpu"` | | +| webhook.autoscaling.metrics[0].resource.target.averageUtilization | int | `80` | | +| webhook.autoscaling.metrics[0].resource.target.type | string | `"Utilization"` | | +| webhook.autoscaling.metrics[0].type | string | `"Resource"` | | +| webhook.autoscaling.metrics[1].resource.name | string | `"memory"` | | +| webhook.autoscaling.metrics[1].resource.target.averageUtilization | int | `80` | | +| webhook.autoscaling.metrics[1].resource.target.type | string | `"Utilization"` | | +| webhook.autoscaling.metrics[1].type | string | `"Resource"` | | +| webhook.autoscaling.minReplicas | int | `1` | | | webhook.enabled | bool | `true` | enable or disable secrets webhook | | webhook.priorityClassName | string | `""` | Sets priorityClassName for webhook pod | +| webhook.prometheus.enabled | bool | `false` | | +| webhook.prometheus.path | string | `"/metrics"` | | +| webhook.prometheus.port | int | `10254` | | | webhook.resources.requests.cpu | string | `"200m"` | | | webhook.resources.requests.ephemeral-storage | string | `"500Mi"` | | | webhook.resources.requests.memory | string | `"500Mi"` | | diff --git a/charts/flyte-core/templates/_helpers.tpl b/charts/flyte-core/templates/_helpers.tpl index b4361a1e470..f7b50c0b29d 100755 --- a/charts/flyte-core/templates/_helpers.tpl +++ b/charts/flyte-core/templates/_helpers.tpl @@ -79,7 +79,6 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{ toYaml . }} {{- end }} {{- end -}} - {{- define "datacatalog.name" -}} datacatalog {{- end -}} diff --git a/charts/flyte-core/templates/clusterresourcesync/deployment.yaml b/charts/flyte-core/templates/clusterresourcesync/deployment.yaml index a2fb5d04ae0..9108a0b3354 100644 --- a/charts/flyte-core/templates/clusterresourcesync/deployment.yaml +++ b/charts/flyte-core/templates/clusterresourcesync/deployment.yaml @@ -16,6 +16,11 @@ spec: {{- with .Values.cluster_resource_manager.podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} + prometheus.io/path: {{ .Values.cluster_resource_manager.prometheus.path | quote }} + prometheus.io/port: {{ .Values.cluster_resource_manager.prometheus.port | quote }} + {{- with .Values.cluster_resource_manager.prometheus.enabled }} + prometheus.io/scrape: "true" + {{- end }} labels: {{ include "flyteclusterresourcesync.podLabels" . | nindent 8 }} spec: containers: @@ -55,6 +60,12 @@ spec: - mountPath: /var/run/credentials name: cluster-secrets {{- end }} + {{- if .Values.cluster_resource_manager.prometheus.enabled }} + ports: + - containerPort: {{ .Values.cluster_resource_manager.prometheus.port }} + name: debug + protocol: TCP + {{- end }} serviceAccountName: {{ .Values.cluster_resource_manager.service_account_name }} volumes: {{- include "databaseSecret.volume" . | nindent 8 }} - configMap: diff --git a/charts/flyte-core/templates/propeller/deployment.yaml b/charts/flyte-core/templates/propeller/deployment.yaml index 5fd09e5d5da..9488dbf8a57 100644 --- a/charts/flyte-core/templates/propeller/deployment.yaml +++ b/charts/flyte-core/templates/propeller/deployment.yaml @@ -25,6 +25,11 @@ spec: {{- with .Values.flytepropeller.podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} + prometheus.io/path: {{ .Values.flytepropeller.prometheus.path | quote }} + prometheus.io/port: {{ .Values.flytepropeller.prometheus.port | quote }} + {{- with .Values.flytepropeller.prometheus.enabled }} + prometheus.io/scrape: "true" + {{- end }} {{- if .Values.flytepropeller.manager }} labels: {{ include "flytepropeller-manager.podLabels" . | nindent 8 }} {{- else }} @@ -78,6 +83,11 @@ spec: {{- end }} ports: - containerPort: {{ index .Values.configmap.core.propeller "prof-port" }} + {{- if .Values.flytepropeller.prometheus.enabled }} + - containerPort: {{ .Values.flytepropeller.prometheus.port }} + name: debug + protocol: TCP + {{- end }} resources: {{- toYaml .Values.flytepropeller.resources | nindent 10 }} volumeMounts: - name: config-volume diff --git a/charts/flyte-core/templates/propeller/webhook-hpa.yaml b/charts/flyte-core/templates/propeller/webhook-hpa.yaml new file mode 100644 index 00000000000..9562287fe71 --- /dev/null +++ b/charts/flyte-core/templates/propeller/webhook-hpa.yaml @@ -0,0 +1,17 @@ +{{- if .Values.webhook.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ template "flyte-pod-webhook.name" . }} + labels: + app: {{ template "flyte-pod-webhook.name" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ template "flyte-pod-webhook.name" . }} + minReplicas: {{ .Values.webhook.autoscaling.minReplicas }} + maxReplicas: {{ .Values.webhook.autoscaling.maxReplicas }} + metrics: + {{ .Values.webhook.autoscaling.metrics | toYaml | nindent 4 }} +{{- end }} diff --git a/charts/flyte-core/templates/propeller/webhook.yaml b/charts/flyte-core/templates/propeller/webhook.yaml index 90241a69f80..89757eff7c1 100644 --- a/charts/flyte-core/templates/propeller/webhook.yaml +++ b/charts/flyte-core/templates/propeller/webhook.yaml @@ -34,6 +34,11 @@ spec: {{- with .Values.flytepropeller.podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} + prometheus.io/path: {{ .Values.webhook.prometheus.path | quote }} + prometheus.io/port: {{ .Values.webhook.prometheus.port | quote }} + {{- with .Values.webhook.prometheus.enabled }} + prometheus.io/scrape: "true" + {{- end }} spec: {{- with .Values.webhook.securityContext }} securityContext: {{ tpl (toYaml .) $ | nindent 8 }} @@ -102,6 +107,11 @@ spec: {{- end }} ports: - containerPort: 9443 + {{- if .Values.webhook.prometheus.enabled }} + - containerPort: {{ .Values.webhook.prometheus.port }} + name: debug + protocol: TCP + {{- end }} securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/charts/flyte-core/values.yaml b/charts/flyte-core/values.yaml index b7f390a45e2..34693250579 100755 --- a/charts/flyte-core/values.yaml +++ b/charts/flyte-core/values.yaml @@ -383,6 +383,12 @@ flytepropeller: interval: 60s # -- Sets the timeout after which request to scrape metrics will time out scrapeTimeout: 30s + + prometheus: + enabled: false + path: "/metrics" + port: 10254 + # # FLYTECONSOLE SETTINGS # @@ -492,6 +498,29 @@ webhook: ephemeral-storage: 500Mi memory: 500Mi + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + + prometheus: + enabled: false + path: "/metrics" + port: 10254 + # ------------------------------------------------ # # COMMON SETTINGS @@ -951,6 +980,11 @@ cluster_resource_manager: - projectQuotaMemory: value: "3000Mi" + prometheus: + enabled: false + path: "/metrics" + port: 10254 + # -- Resource templates that should be applied templates: # -- Template for namespaces resources diff --git a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml index fadb80927ff..2bf9fa2cfe3 100644 --- a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml +++ b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml @@ -1045,6 +1045,8 @@ spec: metadata: annotations: configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte @@ -1270,6 +1272,8 @@ spec: metadata: annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -1352,6 +1356,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/eks/flyte_helm_controlplane_generated.yaml b/deployment/eks/flyte_helm_controlplane_generated.yaml index f110eda1516..982ed24d037 100644 --- a/deployment/eks/flyte_helm_controlplane_generated.yaml +++ b/deployment/eks/flyte_helm_controlplane_generated.yaml @@ -750,6 +750,8 @@ spec: metadata: annotations: configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte diff --git a/deployment/eks/flyte_helm_dataplane_generated.yaml b/deployment/eks/flyte_helm_dataplane_generated.yaml index f7b146dd9f1..8d17d712796 100644 --- a/deployment/eks/flyte_helm_dataplane_generated.yaml +++ b/deployment/eks/flyte_helm_dataplane_generated.yaml @@ -429,6 +429,8 @@ spec: metadata: annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -511,6 +513,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/eks/flyte_helm_generated.yaml b/deployment/eks/flyte_helm_generated.yaml index 4aba6e7f3db..9b2ce0aea31 100644 --- a/deployment/eks/flyte_helm_generated.yaml +++ b/deployment/eks/flyte_helm_generated.yaml @@ -1076,6 +1076,8 @@ spec: metadata: annotations: configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte @@ -1400,6 +1402,8 @@ spec: metadata: annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -1482,6 +1486,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/gcp/flyte_helm_controlplane_generated.yaml b/deployment/gcp/flyte_helm_controlplane_generated.yaml index b227a9c0d0e..02c6ef5edf8 100644 --- a/deployment/gcp/flyte_helm_controlplane_generated.yaml +++ b/deployment/gcp/flyte_helm_controlplane_generated.yaml @@ -765,6 +765,8 @@ spec: metadata: annotations: configChecksum: "dc18f5d54e0770c574e6b0693724047e22063030259104eebb554398d63209f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte diff --git a/deployment/gcp/flyte_helm_dataplane_generated.yaml b/deployment/gcp/flyte_helm_dataplane_generated.yaml index 0da528443fe..e89bde15bf0 100644 --- a/deployment/gcp/flyte_helm_dataplane_generated.yaml +++ b/deployment/gcp/flyte_helm_dataplane_generated.yaml @@ -437,6 +437,8 @@ spec: metadata: annotations: configChecksum: "2a962c9fcb8a58e835ea829883300ae11e9124b9972c5e1fe29e1cc283dd2f9" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -518,6 +520,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "2a962c9fcb8a58e835ea829883300ae11e9124b9972c5e1fe29e1cc283dd2f9" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/gcp/flyte_helm_generated.yaml b/deployment/gcp/flyte_helm_generated.yaml index 6df3f312256..5fe9dcdaaa5 100644 --- a/deployment/gcp/flyte_helm_generated.yaml +++ b/deployment/gcp/flyte_helm_generated.yaml @@ -1099,6 +1099,8 @@ spec: metadata: annotations: configChecksum: "dc18f5d54e0770c574e6b0693724047e22063030259104eebb554398d63209f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte @@ -1423,6 +1425,8 @@ spec: metadata: annotations: configChecksum: "2a962c9fcb8a58e835ea829883300ae11e9124b9972c5e1fe29e1cc283dd2f9" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -1504,6 +1508,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "2a962c9fcb8a58e835ea829883300ae11e9124b9972c5e1fe29e1cc283dd2f9" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/sandbox/flyte_helm_generated.yaml b/deployment/sandbox/flyte_helm_generated.yaml index f10c4cfdb3a..c984ddac6c0 100644 --- a/deployment/sandbox/flyte_helm_generated.yaml +++ b/deployment/sandbox/flyte_helm_generated.yaml @@ -6870,6 +6870,8 @@ spec: metadata: annotations: configChecksum: "475154c41cdb06999025ab796aa1264fa3d235df51ac088a39c89c7ce300408" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte @@ -7174,6 +7176,8 @@ spec: metadata: annotations: configChecksum: "f892b909c52752746c1b17c780ae5733f70d8c731acc9a89c31361c5690c8a5" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -7248,6 +7252,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "f892b909c52752746c1b17c780ae5733f70d8c731acc9a89c31361c5690c8a5" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/docker/sandbox-bundled/manifests/complete-agent.yaml b/docker/sandbox-bundled/manifests/complete-agent.yaml index 0a18e42b33e..04d26d6440b 100644 --- a/docker/sandbox-bundled/manifests/complete-agent.yaml +++ b/docker/sandbox-bundled/manifests/complete-agent.yaml @@ -816,7 +816,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: eHMzc0p4bTZoSVVFb1V0Uw== + haSharedSecret: MWZZMXJJcWFNeW5pVG9vRg== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1412,7 +1412,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 174761d8b2609550a723b808cf0807c0b29cd4d3e1050ee85178a46fbca1a61c + checksum/secret: b0052aa89fd923e4be52d976ca724f568545dcab728e5f921f20c882546abfb3 labels: app: docker-registry release: flyte-sandbox diff --git a/docker/sandbox-bundled/manifests/complete.yaml b/docker/sandbox-bundled/manifests/complete.yaml index 2bdaa4bb4a2..e17a424ef94 100644 --- a/docker/sandbox-bundled/manifests/complete.yaml +++ b/docker/sandbox-bundled/manifests/complete.yaml @@ -796,7 +796,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: OFdhR0JObmY4TkFWd1JaMg== + haSharedSecret: a0xtWmJJU2tyNUVMT3NqbQ== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1360,7 +1360,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 71ade407a1350a0e7ee684b637e0a0d15cf511e83431b5466448e1a79da1d275 + checksum/secret: de6b9493b8ac7022ecaf53f7bbd447a8f74e594e429e4c49b9fa346458c7a80d labels: app: docker-registry release: flyte-sandbox diff --git a/docker/sandbox-bundled/manifests/dev.yaml b/docker/sandbox-bundled/manifests/dev.yaml index 0ea904cd281..ef69639c32a 100644 --- a/docker/sandbox-bundled/manifests/dev.yaml +++ b/docker/sandbox-bundled/manifests/dev.yaml @@ -499,7 +499,7 @@ metadata: --- apiVersion: v1 data: - haSharedSecret: b1k1Q0xLa2hkR3doaG9NMg== + haSharedSecret: VENNTk91RkVPTWdUa3V3VA== proxyPassword: "" proxyUsername: "" kind: Secret @@ -934,7 +934,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 7d2c67983dc470228224d912a242b2abcc695e391dc54f755e95dd5b820c7215 + checksum/secret: 7fa86dc213b7e243519854ad34d0f779e2e48304da9cd8db5f019c52f958302e labels: app: docker-registry release: flyte-sandbox diff --git a/flyteadmin/cmd/entrypoints/clusterresource.go b/flyteadmin/cmd/entrypoints/clusterresource.go index bb47d8775f0..578d92868e9 100644 --- a/flyteadmin/cmd/entrypoints/clusterresource.go +++ b/flyteadmin/cmd/entrypoints/clusterresource.go @@ -10,6 +10,7 @@ import ( "github.com/flyteorg/flyte/flyteadmin/pkg/clusterresource" "github.com/flyteorg/flyte/flyteadmin/pkg/runtime" "github.com/flyteorg/flyte/flytestdlib/logger" + "github.com/flyteorg/flyte/flytestdlib/profutils" "github.com/flyteorg/flyte/flytestdlib/promutils" ) @@ -29,6 +30,17 @@ var controllerRunCmd = &cobra.Command{ if err != nil { return err } + + // Serve profiling endpoints. + cfg := runtime.NewConfigurationProvider() + go func() { + err := profutils.StartProfilingServerWithDefaultHandlers( + ctx, cfg.ApplicationConfiguration().GetTopLevelConfig().GetProfilerPort(), nil) + if err != nil { + logger.Panicf(ctx, "Failed to Start profiling and Metrics server. Error, %v", err) + } + }() + clusterResourceController.Run() logger.Infof(ctx, "ClusterResourceController started running successfully") return nil diff --git a/flyteadmin/pkg/clusterresource/controller.go b/flyteadmin/pkg/clusterresource/controller.go index cdf9ea06557..daad2600e8e 100644 --- a/flyteadmin/pkg/clusterresource/controller.go +++ b/flyteadmin/pkg/clusterresource/controller.go @@ -69,6 +69,7 @@ type Controller interface { type controllerMetrics struct { Scope promutils.Scope + SyncErrors prometheus.Counter SyncStarted prometheus.Counter KubernetesResourcesCreated prometheus.Counter KubernetesResourcesCreateErrors prometheus.Counter @@ -615,6 +616,7 @@ func (c *controller) Sync(ctx context.Context) error { logger.Infof(ctx, "Completed cluster resource creation loop with stats: [%+v]", stats) if len(errs) > 0 { + c.metrics.SyncErrors.Add(float64(len(errs))) return errors.NewCollectedFlyteAdminError(codes.Internal, errs) } @@ -637,7 +639,8 @@ func (c *controller) Run() { func newMetrics(scope promutils.Scope) controllerMetrics { return controllerMetrics{ - Scope: scope, + Scope: scope, + SyncErrors: scope.MustNewCounter("sync_errors", "overall count of errors that occurred within a 'sync' method"), SyncStarted: scope.MustNewCounter("k8s_resource_syncs", "overall count of the number of invocations of the resource controller 'sync' method"), KubernetesResourcesCreated: scope.MustNewCounter("k8s_resources_created",