Adding worker autoscaling support with KEDA
sdaberdaku committed Jan 11, 2025
1 parent 424580a commit 30756e8
Showing 9 changed files with 233 additions and 19 deletions.
72 changes: 70 additions & 2 deletions charts/trino/README.md
@@ -78,8 +78,9 @@ Fast distributed SQL query engine for big data analytics that helps you explore
```
* `server.workerExtraConfig` - string, default: `""`
* `server.coordinatorExtraConfig` - string, default: `""`
* `server.autoscaling.enabled` - bool, default: `false`
* `server.autoscaling.maxReplicas` - int, default: `5`
* `server.autoscaling` - object, default: `{"behavior":{},"enabled":false,"maxReplicas":5,"targetCPUUtilizationPercentage":50,"targetMemoryUtilizationPercentage":80}`

Configure [Horizontal Pod Autoscaling](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) for workers (`server.keda.enabled` must be `false`).
* `server.autoscaling.targetCPUUtilizationPercentage` - int, default: `50`

Target average CPU utilization, represented as a percentage of requested CPU. To disable scaling based on CPU, set to an empty string.
@@ -108,6 +109,70 @@ Fast distributed SQL query engine for big data analytics that helps you explore
periodSeconds: 15
selectPolicy: Max
```
* `server.keda` - object, default: `{"advanced":{},"annotations":{},"cooldownPeriod":300,"enabled":false,"fallback":{},"initialCooldownPeriod":0,"maxReplicaCount":5,"minReplicaCount":0,"pollingInterval":30,"triggers":[]}`

Configure [Kubernetes Event-driven Autoscaling](https://keda.sh/) for workers (`server.autoscaling.enabled` must be `false`).
* `server.keda.cooldownPeriod` - int, default: `300`

Period (in seconds) to wait after the last trigger reported active before scaling the resource back to 0
* `server.keda.initialCooldownPeriod` - int, default: `0`

The delay (in seconds) before the `cooldownPeriod` starts after the initial creation of the `ScaledObject`.
* `server.keda.minReplicaCount` - int, default: `0`

Minimum number of replicas KEDA will scale the resource down to. By default, it scales to zero, but any other value can be used as well.
* `server.keda.maxReplicaCount` - int, default: `5`

This setting is passed to the HPA definition that KEDA will create for a given resource and holds the maximum number of replicas of the target resource.
* `server.keda.fallback` - object, default: `{}`

Defines a number of replicas to fall back to if a scaler is in an error state.
Example:
```yaml
fallback: # Optional. Section to specify fallback options
failureThreshold: 3 # Mandatory if fallback section is included
replicas: 6 # Mandatory if fallback section is included
```
* `server.keda.advanced` - object, default: `{}`

Specifies HPA-related options
Example:
```yaml
advanced:
horizontalPodAutoscalerConfig:
behavior:
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 100
periodSeconds: 15
```
* `server.keda.triggers` - list, default: `[]`

List of triggers to activate scaling of the target resource
Example:
```yaml
triggers:
- type: prometheus
metricType: Value
metadata:
serverAddress: "http://prometheus.example.com"
threshold: "1"
metricName: required_workers
query: >-
sum by (service)
(avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))
```
* `server.keda.annotations` - object, default: `{}`

Annotations to apply to the ScaledObject CRD.
Example:
```yaml
annotations:
autoscaling.keda.sh/paused-replicas: "0"
autoscaling.keda.sh/paused: "true"
```
* `accessControl` - object, default: `{}`

[System access control](https://trino.io/docs/current/security/built-in-system-access-control.html) configuration.
@@ -435,6 +500,9 @@ Fast distributed SQL query engine for big data analytics that helps you explore
* `coordinator.jvm.gcMethod.type` - string, default: `"UseG1GC"`
* `coordinator.jvm.gcMethod.g1.heapRegionSize` - string, default: `"32M"`
* `coordinator.config.memory.heapHeadroomPerNode` - string, default: `""`
* `coordinator.config.nodeScheduler.includeCoordinator` - bool, default: `false`

Allows scheduling work on the coordinator so that a single machine can function as both coordinator and worker. For large clusters, processing work on the coordinator can negatively impact query performance because the machine's resources are not available for the critical coordinator tasks of scheduling, managing, and monitoring query execution.
* `coordinator.config.query.maxMemoryPerNode` - string, default: `"1GB"`
* `coordinator.additionalJVMConfig` - list, default: `[]`
* `coordinator.additionalExposedPorts` - object, default: `{}`
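For orientation, a minimal values override that enables KEDA-based worker autoscaling as documented above could look like the following sketch (the Prometheus address and threshold mirror the README example and are placeholders, not required values):

```yaml
server:
  workers: 0
  autoscaling:
    enabled: false  # must remain disabled when KEDA is used
  keda:
    enabled: true
    minReplicaCount: 0
    maxReplicaCount: 5
    triggers:
      - type: prometheus
        metricType: Value
        metadata:
          serverAddress: "http://prometheus.example.com"  # placeholder address
          threshold: "1"
          metricName: required_workers
          query: >-
            sum by (service)
            (avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))
```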
6 changes: 1 addition & 5 deletions charts/trino/templates/configmap-coordinator.yaml
@@ -50,11 +50,7 @@ data:

config.properties: |
coordinator=true
{{- if gt (int .Values.server.workers) 0 }}
node-scheduler.include-coordinator=false
{{- else }}
node-scheduler.include-coordinator=true
{{- end }}
node-scheduler.include-coordinator={{ .Values.coordinator.config.nodeScheduler.includeCoordinator }}
http-server.http.port={{ .Values.service.port }}
query.max-memory={{ .Values.server.config.query.maxMemory }}
query.max-memory-per-node={{ .Values.coordinator.config.query.maxMemoryPerNode }}
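As an illustration of the templating change above, rendering the coordinator ConfigMap with `coordinator.config.nodeScheduler.includeCoordinator=true` would now produce a `config.properties` that begins roughly as follows (a sketch showing only the affected lines):

```properties
coordinator=true
node-scheduler.include-coordinator=true
```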
2 changes: 1 addition & 1 deletion charts/trino/templates/configmap-worker.yaml
@@ -1,5 +1,5 @@
{{- $workerJmx := merge .Values.jmx.worker (omit .Values.jmx "coordinator" "worker") -}}
{{- if gt (int .Values.server.workers) 0 }}
{{- if or .Values.server.keda.enabled (gt (int .Values.server.workers) 0) }}
apiVersion: v1
kind: ConfigMap
metadata:
4 changes: 2 additions & 2 deletions charts/trino/templates/deployment-worker.yaml
@@ -1,5 +1,5 @@
{{- $workerJmx := merge .Values.jmx.worker (omit .Values.jmx "coordinator" "worker") -}}
{{- if gt (int .Values.server.workers) 0 }}
{{- if or .Values.server.keda.enabled (gt (int .Values.server.workers) 0) }}
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -19,7 +19,7 @@ spec:
revisionHistoryLimit: {{ .Values.worker.deployment.revisionHistoryLimit }}
strategy:
{{- toYaml .Values.worker.deployment.strategy | nindent 4 }}
{{- if not .Values.server.autoscaling.enabled }}
{{- if and (not .Values.server.autoscaling.enabled) (not .Values.server.keda.enabled) }}
replicas: {{ .Values.server.workers }}
{{- end }}
selector:
40 changes: 40 additions & 0 deletions charts/trino/templates/keda-scaledobject.yaml
@@ -0,0 +1,40 @@
{{- if .Values.server.keda.enabled }}
{{- if .Values.server.autoscaling.enabled }}
{{- fail "The worker Kubernetes Event-driven Autoscaling configuration (`server.keda`) conflicts with the worker Horizontal Pod Autoscaling (`server.autoscaling`). They cannot be both enabled at the same time!" }}
{{- end }}
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
name: {{ template "trino.worker" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "trino.labels" . | nindent 4 }}
{{- with .Values.server.keda.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ template "trino.worker" . }}
pollingInterval: {{ .Values.server.keda.pollingInterval }}
cooldownPeriod: {{ .Values.server.keda.cooldownPeriod }}
initialCooldownPeriod: {{ .Values.server.keda.initialCooldownPeriod }}
minReplicaCount: {{ .Values.server.keda.minReplicaCount }}
maxReplicaCount: {{ .Values.server.keda.maxReplicaCount }}
{{- with .Values.server.keda.fallback }}
fallback:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.server.keda.advanced }}
advanced:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.server.keda.triggers }}
triggers:
{{- tpl (toYaml .) $ | nindent 4 }}
{{- else }}
{{- fail "At least one element in `.Values.server.keda.triggers` is required!" }}
{{- end }}
{{- end }}
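For reference, with a release named `trino` and the Prometheus trigger from the README example, the template above would render roughly the following manifest (a sketch; the release name, namespace, and metric values are assumptions):

```yaml
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: trino-worker       # assumes a release named "trino"
  namespace: default       # assumes the default namespace
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: trino-worker
  pollingInterval: 30
  cooldownPeriod: 300
  initialCooldownPeriod: 0
  minReplicaCount: 0
  maxReplicaCount: 5
  triggers:
    - type: prometheus
      metricType: Value
      metadata:
        serverAddress: "http://prometheus.example.com"  # placeholder address
        threshold: "1"
        metricName: required_workers
        query: sum by (service) (avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service="trino"}[5s]))
```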
2 changes: 1 addition & 1 deletion charts/trino/templates/tests/test-connection.yaml
@@ -42,7 +42,7 @@ spec:
- --password
{{- end }}
- --debug
- --execute=SELECT 1
- --execute=SELECT COUNT(*) FROM tpch.tiny.nation
- --no-progress
{{- if eq .Values.server.config.authenticationType "PASSWORD" }}
env:
73 changes: 73 additions & 0 deletions charts/trino/values.yaml
@@ -82,6 +82,8 @@ server:

workerExtraConfig: ""
coordinatorExtraConfig: ""
# server.autoscaling -- Configure [Horizontal Pod Autoscaling](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
# for workers (`server.keda.enabled` must be `false`).
autoscaling:
enabled: false
maxReplicas: 5
@@ -114,6 +116,71 @@ server:
# selectPolicy: Max
# ```

# server.keda -- Configure [Kubernetes Event-driven Autoscaling](https://keda.sh/) for workers
# (`server.autoscaling.enabled` must be `false`).
keda:
enabled: false
pollingInterval: 30
# -- Period (in seconds) to wait after the last trigger reported active before scaling the resource back to 0
cooldownPeriod: 300
# -- The delay (in seconds) before the `cooldownPeriod` starts after the initial creation of the `ScaledObject`.
initialCooldownPeriod: 0
# -- Minimum number of replicas KEDA will scale the resource down to.
# By default, it scales to zero, but any other value can be used as well.
minReplicaCount: 0
# -- This setting is passed to the HPA definition that KEDA will create for a given resource and
# holds the maximum number of replicas of the target resource.
maxReplicaCount: 5
fallback: {}
# server.keda.fallback -- Defines a number of replicas to fall back to if a scaler is in an error state.
# @raw
# Example:
# ```yaml
# fallback: # Optional. Section to specify fallback options
# failureThreshold: 3 # Mandatory if fallback section is included
# replicas: 6 # Mandatory if fallback section is included
# ```
advanced: {}
# server.keda.advanced -- Specifies HPA-related options
# @raw
# Example:
# ```yaml
# advanced:
# horizontalPodAutoscalerConfig:
# behavior:
# scaleDown:
# stabilizationWindowSeconds: 300
# policies:
# - type: Percent
# value: 100
# periodSeconds: 15
# ```
triggers: []
# server.keda.triggers -- List of triggers to activate scaling of the target resource
# @raw
# Example:
# ```yaml
# triggers:
# - type: prometheus
# metricType: Value
# metadata:
# serverAddress: "http://prometheus.example.com"
# threshold: "1"
# metricName: required_workers
# query: >-
# sum by (service)
# (avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))
# ```
annotations: {}
# server.keda.annotations -- Annotations to apply to the ScaledObject CRD.
# @raw
# Example:
# ```yaml
# annotations:
# autoscaling.keda.sh/paused-replicas: "0"
# autoscaling.keda.sh/paused: "true"
# ```

accessControl: {}
# accessControl -- [System access
# control](https://trino.io/docs/current/security/built-in-system-access-control.html)
@@ -507,6 +574,12 @@ coordinator:
config:
memory:
heapHeadroomPerNode: ""
nodeScheduler:
includeCoordinator: false
# coordinator.config.nodeScheduler.includeCoordinator -- Allows scheduling work on the coordinator so that a
# single machine can function as both coordinator and worker. For large clusters, processing work on the
# coordinator can negatively impact query performance because the machine's resources are not available for the
# critical coordinator tasks of scheduling, managing, and monitoring query execution.
query:
maxMemoryPerNode: "1GB"

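As a usage sketch, the new values can be applied at install time roughly like this (the release name, namespace, and chart repository are assumptions; trigger lists are easier to pass via a values file than via `--set`):

```bash
helm repo add trino https://trinodb.github.io/charts  # assumed chart repository location
# keda-values.yaml holds the server.keda block from the README example
helm upgrade --install trino trino/trino \
  --namespace trino --create-namespace \
  --values keda-values.yaml
```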
27 changes: 25 additions & 2 deletions tests/trino/test-values.yaml
@@ -3,7 +3,7 @@
# Declare variables to be passed into your templates.

server:
workers: 2
workers: 0
config:
https:
enabled: true
@@ -14,7 +14,24 @@ server:
query.client.timeout=5m
query.execution-policy=phased
autoscaling:
enabled: false
keda:
enabled: true
pollingInterval: 5
minReplicaCount: 0
maxReplicaCount: 2
cooldownPeriod: 300
triggers:
- type: prometheus
metricType: Value
metadata:
serverAddress: http://prometheus-operator-kube-p-prometheus.{{ .Release.Namespace }}:9090
threshold: "1"
metricName: required_workers
query: >-
sum by (service)
(avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))
additionalConfigProperties:
- internal-communication.shared-secret=random-value-999
@@ -247,12 +264,13 @@ jmx:
rules:
- pattern: 'trino.memory*'
- pattern: 'trino.execution<name=QueryManager>*'
- pattern: 'trino.execution<name=ClusterSizeMonitor>*'
serviceMonitor:
enabled: true
labels:
prometheus: default
interval: "30s"
interval: "1s"

ingress:
enabled: true
@@ -271,3 +289,8 @@ networkPolicy:
- key: test
operator: NotIn
values: [network-policy]

catalogs:
tpch: |
connector.name=tpch
tpch.splits-per-node=4
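To observe the effect of these test values at runtime, the objects created by KEDA can be inspected roughly as follows (a sketch; the resource name assumes the default `trino-worker` naming and `$NAMESPACE` as used in `test.sh`):

```bash
# ScaledObject created by the chart, plus the HPA that KEDA derives from it
kubectl get scaledobjects,hpa -n "$NAMESPACE"
kubectl describe scaledobject trino-worker -n "$NAMESPACE"
# worker replicas should grow from 0 once the Prometheus trigger reports required workers
kubectl get deployment trino-worker -n "$NAMESPACE" --watch
```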
26 changes: 20 additions & 6 deletions tests/trino/test.sh
@@ -4,7 +4,7 @@ set -euo pipefail

declare -A testCases=(
[default]=""
[single_node]="--set server.workers=0"
[single_node]="--set server.workers=0,coordinator.config.nodeScheduler.includeCoordinator=true"
[complete_values]="--values test-values.yaml"
[overrides]="--set coordinatorNameOverride=coordinator-overridden,workerNameOverride=worker-overridden,nameOverride=overridden"
[access_control_properties_values]="--values test-access-control-properties-values.yaml"
@@ -34,6 +34,7 @@ function join_by {
# default to randomly generated namespace, same as chart-testing would do, but we need to load secrets into the same namespace
NAMESPACE=trino-$(LC_ALL=C tr -dc 'a-z0-9' </dev/urandom | head -c 6 || true)
DB_NAMESPACE=postgresql
KEDA_NAMESPACE=keda
HELM_EXTRA_SET_ARGS=
CT_ARGS=(
--skip-clean-up
@@ -105,8 +106,9 @@
storage: 128Mi
YAML

# only install the Prometheus Helm chart when running the `complete_values` test
# only install the Prometheus and KEDA Helm charts when running the `complete_values` test
if printf '%s\0' "${TEST_NAMES[@]}" | grep -qwz complete_values; then
# prometheus
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm upgrade --install prometheus-operator prometheus-community/kube-prometheus-stack -n "$NAMESPACE" \
--version "60.0.2" \
@@ -129,6 +131,14 @@ if printf '%s\0' "${TEST_NAMES[@]}" | grep -qwz complete_values; then
--set prometheusOperator.serviceMonitor.selfMonitor=false \
--set prometheus.serviceMonitor.selfMonitor=false
kubectl rollout status --watch deployments -l release=prometheus-operator -n "$NAMESPACE"
# keda
helm repo add kedacore https://kedacore.github.io/charts
helm upgrade --install keda kedacore/keda -n "$KEDA_NAMESPACE" \
--create-namespace \
--version "2.16.0" \
--set webhooks.enabled=false \
--set asciiArt=false
kubectl rollout status --watch deployments -l app.kubernetes.io/instance=keda -n "$KEDA_NAMESPACE"
fi

# only install the PostgreSQL Helm chart when running the `resource_groups_properties` test
@@ -171,10 +181,14 @@ if [ "$CLEANUP_NAMESPACE" == "true" ]; then
kubectl delete namespace "$DB_NAMESPACE" --ignore-not-found
helm -n "$NAMESPACE" uninstall prometheus-operator --ignore-not-found
kubectl delete namespace "$NAMESPACE"
mapfile -t crds < <(kubectl api-resources --api-group=monitoring.coreos.com --output name)
if [ ${#crds[@]} -ne 0 ]; then
kubectl delete crd "${crds[@]}"
fi
helm -n "$KEDA_NAMESPACE" uninstall keda --ignore-not-found
kubectl delete namespace "$KEDA_NAMESPACE"
for api_group in monitoring.coreos.com eventing.keda.sh keda.sh; do
mapfile -t crds < <(kubectl api-resources --api-group="$api_group" --output name)
if [ ${#crds[@]} -ne 0 ]; then
kubectl delete crd "${crds[@]}"
fi
done
fi

exit $result
