Adding worker autoscaling support with KEDA #277

Merged 2 commits on Feb 10, 2025
72 changes: 70 additions & 2 deletions charts/trino/README.md
@@ -78,8 +78,9 @@ Fast distributed SQL query engine for big data analytics that helps you explore
```
* `server.workerExtraConfig` - string, default: `""`
* `server.coordinatorExtraConfig` - string, default: `""`
* `server.autoscaling.enabled` - bool, default: `false`
* `server.autoscaling.maxReplicas` - int, default: `5`
* `server.autoscaling` - object, default: `{"behavior":{},"enabled":false,"maxReplicas":5,"targetCPUUtilizationPercentage":50,"targetMemoryUtilizationPercentage":80}`

Configure [Horizontal Pod Autoscaling](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) for workers (`server.keda.enabled` must be `false`).
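  For illustration, a values override that enables the worker HPA could look like the following (the replica bound and CPU target are placeholders, not recommendations):
  ```yaml
  server:
    autoscaling:
      enabled: true
      maxReplicas: 10                     # upper bound for the worker HPA
      targetCPUUtilizationPercentage: 70  # scale out above 70% of requested CPU
    keda:
      enabled: false                      # HPA and KEDA cannot be enabled together
  ```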
* `server.autoscaling.targetCPUUtilizationPercentage` - int, default: `50`

Target average CPU utilization, represented as a percentage of requested CPU. To disable scaling based on CPU, set to an empty string.
@@ -108,6 +109,70 @@ Fast distributed SQL query engine for big data analytics that helps you explore
periodSeconds: 15
selectPolicy: Max
```
* `server.keda` - object, default: `{"advanced":{},"annotations":{},"cooldownPeriod":300,"enabled":false,"fallback":{},"initialCooldownPeriod":0,"maxReplicaCount":5,"minReplicaCount":0,"pollingInterval":30,"triggers":[]}`

Configure [Kubernetes Event-driven Autoscaling](https://keda.sh/) for workers (`server.autoscaling.enabled` must be `false`).
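  For illustration, a minimal override switching workers to KEDA-driven scaling could look like this; the Prometheus address is a placeholder and at least one trigger must be supplied:
  ```yaml
  server:
    workers: 0              # KEDA manages the replica count; scale to zero is possible
    autoscaling:
      enabled: false        # must stay disabled when KEDA is enabled
    keda:
      enabled: true
      minReplicaCount: 0
      maxReplicaCount: 5
      triggers:             # at least one trigger is required
        - type: prometheus
          metricType: Value
          metadata:
            serverAddress: "http://prometheus.example.com"  # placeholder address
            threshold: "1"
            metricName: required_workers
            query: >-
              sum by (service)
              (avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))
  ```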
* `server.keda.cooldownPeriod` - int, default: `300`

Period (in seconds) to wait after the last trigger reported active before scaling the resource back to 0
* `server.keda.initialCooldownPeriod` - int, default: `0`

The delay (in seconds) before the `cooldownPeriod` starts after the initial creation of the `ScaledObject`.
* `server.keda.minReplicaCount` - int, default: `0`

Minimum number of replicas KEDA will scale the resource down to. By default, it scales to zero, but any other value can be used as well.
* `server.keda.maxReplicaCount` - int, default: `5`

This setting is passed to the HPA definition that KEDA will create for a given resource and holds the maximum number of replicas of the target resource.
* `server.keda.fallback` - object, default: `{}`

Defines the number of replicas to fall back to if a scaler is in an error state.
Example:
```yaml
fallback: # Optional. Section to specify fallback options
  failureThreshold: 3 # Mandatory if fallback section is included
  replicas: 6 # Mandatory if fallback section is included
```
* `server.keda.advanced` - object, default: `{}`

Specifies HPA-related options.
Example:
```yaml
advanced:
  horizontalPodAutoscalerConfig:
    behavior:
      scaleDown:
        stabilizationWindowSeconds: 300
        policies:
          - type: Percent
            value: 100
            periodSeconds: 15
```
* `server.keda.triggers` - list, default: `[]`

List of triggers to activate scaling of the target resource
Example:
```yaml
triggers:
  - type: prometheus
    metricType: Value
    metadata:
      serverAddress: "http://prometheus.example.com"
      threshold: "1"
      metricName: required_workers
      query: >-
        sum by (service)
        (avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))
```
* `server.keda.annotations` - object, default: `{}`

Annotations to apply to the ScaledObject resource.
Example:
```yaml
annotations:
  autoscaling.keda.sh/paused-replicas: "0"
  autoscaling.keda.sh/paused: "true"
```
* `accessControl` - object, default: `{}`

[System access control](https://trino.io/docs/current/security/built-in-system-access-control.html) configuration.
@@ -435,6 +500,9 @@ Fast distributed SQL query engine for big data analytics that helps you explore
* `coordinator.jvm.gcMethod.type` - string, default: `"UseG1GC"`
* `coordinator.jvm.gcMethod.g1.heapRegionSize` - string, default: `"32M"`
* `coordinator.config.memory.heapHeadroomPerNode` - string, default: `""`
* `coordinator.config.nodeScheduler.includeCoordinator` - bool, default: `false`

Allows scheduling work on the coordinator so that a single machine can function as both coordinator and worker. For large clusters, processing work on the coordinator can negatively impact query performance because the machine's resources are not available for the critical coordinator tasks of scheduling, managing, and monitoring query execution.
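  For illustration, a single-node setup that also executes tasks on the coordinator could be sketched as follows (not a recommended layout for larger clusters):
  ```yaml
  server:
    workers: 0                      # no dedicated worker pods
  coordinator:
    config:
      nodeScheduler:
        includeCoordinator: true    # schedule query tasks on the coordinator as well
  ```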
* `coordinator.config.query.maxMemoryPerNode` - string, default: `"1GB"`
* `coordinator.additionalJVMConfig` - list, default: `[]`
* `coordinator.additionalExposedPorts` - object, default: `{}`
6 changes: 1 addition & 5 deletions charts/trino/templates/configmap-coordinator.yaml
@@ -50,11 +50,7 @@ data:

config.properties: |
coordinator=true
{{- if gt (int .Values.server.workers) 0 }}
node-scheduler.include-coordinator=false
{{- else }}
node-scheduler.include-coordinator=true
{{- end }}
node-scheduler.include-coordinator={{ .Values.coordinator.config.nodeScheduler.includeCoordinator }}
http-server.http.port={{ .Values.service.port }}
query.max-memory={{ .Values.server.config.query.maxMemory }}
query.max-memory-per-node={{ .Values.coordinator.config.query.maxMemoryPerNode }}
2 changes: 1 addition & 1 deletion charts/trino/templates/configmap-worker.yaml
@@ -1,5 +1,5 @@
{{- $workerJmx := merge .Values.jmx.worker (omit .Values.jmx "coordinator" "worker") -}}
{{- if gt (int .Values.server.workers) 0 }}
{{- if or .Values.server.keda.enabled (gt (int .Values.server.workers) 0) }}
apiVersion: v1
kind: ConfigMap
metadata:
4 changes: 2 additions & 2 deletions charts/trino/templates/deployment-worker.yaml
@@ -1,5 +1,5 @@
{{- $workerJmx := merge .Values.jmx.worker (omit .Values.jmx "coordinator" "worker") -}}
{{- if gt (int .Values.server.workers) 0 }}
{{- if or .Values.server.keda.enabled (gt (int .Values.server.workers) 0) }}
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -19,7 +19,7 @@ spec:
revisionHistoryLimit: {{ .Values.worker.deployment.revisionHistoryLimit }}
strategy:
{{- toYaml .Values.worker.deployment.strategy | nindent 4 }}
{{- if not .Values.server.autoscaling.enabled }}
{{- if and (not .Values.server.autoscaling.enabled) (not .Values.server.keda.enabled) }}
replicas: {{ .Values.server.workers }}
{{- end }}
selector:
40 changes: 40 additions & 0 deletions charts/trino/templates/keda-scaledobject.yaml
@@ -0,0 +1,40 @@
{{- if .Values.server.keda.enabled }}
{{- if .Values.server.autoscaling.enabled }}
{{- fail "The worker Kubernetes Event-driven Autoscaling configuration (`server.keda`) conflicts with the worker Horizontal Pod Autoscaling (`server.autoscaling`). They cannot both be enabled at the same time!" }}
{{- end }}
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: {{ template "trino.worker" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "trino.labels" . | nindent 4 }}
  {{- with .Values.server.keda.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ template "trino.worker" . }}
  pollingInterval: {{ .Values.server.keda.pollingInterval }}
  cooldownPeriod: {{ .Values.server.keda.cooldownPeriod }}
  initialCooldownPeriod: {{ .Values.server.keda.initialCooldownPeriod }}
  minReplicaCount: {{ .Values.server.keda.minReplicaCount }}
  maxReplicaCount: {{ .Values.server.keda.maxReplicaCount }}
  {{- with .Values.server.keda.fallback }}
  fallback:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  {{- with .Values.server.keda.advanced }}
  advanced:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  {{- with .Values.server.keda.triggers }}
  triggers:
    {{- tpl (toYaml .) $ | nindent 4 }}
  {{- else }}
  {{- fail "At least one element in `.Values.server.keda.triggers` is required!" }}
  {{- end }}
{{- end }}
2 changes: 1 addition & 1 deletion charts/trino/templates/tests/test-connection.yaml
@@ -42,7 +42,7 @@ spec:
- --password
{{- end }}
- --debug
- --execute=SELECT 1
- --execute=SELECT COUNT(*) FROM tpch.tiny.nation
- --no-progress
{{- if eq .Values.server.config.authenticationType "PASSWORD" }}
env:
73 changes: 73 additions & 0 deletions charts/trino/values.yaml
@@ -82,6 +82,8 @@ server:

workerExtraConfig: ""
coordinatorExtraConfig: ""
# server.autoscaling -- Configure [Horizontal Pod Autoscaling](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
# for workers (`server.keda.enabled` must be `false`).
autoscaling:
enabled: false
maxReplicas: 5
@@ -114,6 +116,71 @@ server:
# selectPolicy: Max
# ```

# server.keda -- Configure [Kubernetes Event-driven Autoscaling](https://keda.sh/) for workers
# (`server.autoscaling.enabled` must be `false`).
keda:
enabled: false
pollingInterval: 30
# -- Period (in seconds) to wait after the last trigger reported active before scaling the resource back to 0
cooldownPeriod: 300
# -- The delay (in seconds) before the `cooldownPeriod` starts after the initial creation of the `ScaledObject`.
initialCooldownPeriod: 0
# -- Minimum number of replicas KEDA will scale the resource down to.
# By default, it scales to zero, but any other value can be used as well.
minReplicaCount: 0
# -- This setting is passed to the HPA definition that KEDA will create for a given resource and
# holds the maximum number of replicas of the target resource.
maxReplicaCount: 5
fallback: {}
# server.keda.fallback -- Defines the number of replicas to fall back to if a scaler is in an error state.
# @raw
# Example:
# ```yaml
# fallback: # Optional. Section to specify fallback options
# failureThreshold: 3 # Mandatory if fallback section is included
# replicas: 6 # Mandatory if fallback section is included
# ```
advanced: {}
# server.keda.advanced -- Specifies HPA-related options.
# @raw
# Example:
# ```yaml
# advanced:
# horizontalPodAutoscalerConfig:
# behavior:
# scaleDown:
# stabilizationWindowSeconds: 300
# policies:
# - type: Percent
# value: 100
# periodSeconds: 15
# ```
triggers: []
# server.keda.triggers -- List of triggers to activate scaling of the target resource
# @raw
# Example:
# ```yaml
# triggers:
# - type: prometheus
# metricType: Value
# metadata:
# serverAddress: "http://prometheus.example.com"
# threshold: "1"
# metricName: required_workers
# query: >-
# sum by (service)
# (avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))
# ```
annotations: {}
# server.keda.annotations -- Annotations to apply to the ScaledObject resource.
# @raw
# Example:
# ```yaml
# annotations:
# autoscaling.keda.sh/paused-replicas: "0"
# autoscaling.keda.sh/paused: "true"
# ```

accessControl: {}
# accessControl -- [System access
# control](https://trino.io/docs/current/security/built-in-system-access-control.html)
@@ -507,6 +574,12 @@ coordinator:
config:
memory:
heapHeadroomPerNode: ""
nodeScheduler:
includeCoordinator: false
# coordinator.config.nodeScheduler.includeCoordinator -- Allows scheduling work on the coordinator so that a
# single machine can function as both coordinator and worker. For large clusters, processing work on the
# coordinator can negatively impact query performance because the machine's resources are not available for the
# critical coordinator tasks of scheduling, managing, and monitoring query execution.
query:
maxMemoryPerNode: "1GB"

27 changes: 25 additions & 2 deletions tests/trino/test-values.yaml
@@ -3,7 +3,7 @@
# Declare variables to be passed into your templates.

server:
workers: 2
workers: 0
config:
https:
enabled: true
@@ -14,7 +14,24 @@ server:
query.client.timeout=5m
query.execution-policy=phased
autoscaling:
enabled: false
keda:
enabled: true
pollingInterval: 5
minReplicaCount: 0
maxReplicaCount: 2
cooldownPeriod: 300
triggers:
- type: prometheus
metricType: Value
metadata:
serverAddress: http://prometheus-operator-kube-p-prometheus.{{ .Release.Namespace }}:9090
threshold: "1"
metricName: required_workers
query: >-
sum by (service)
(avg_over_time(trino_execution_ClusterSizeMonitor_RequiredWorkers{service={{ include "trino.fullname" . | quote }}}[5s]))


additionalConfigProperties:
- internal-communication.shared-secret=random-value-999
@@ -247,12 +264,13 @@ jmx:
rules:
- pattern: 'trino.memory*'
- pattern: 'trino.execution<name=QueryManager>*'
- pattern: 'trino.execution<name=ClusterSizeMonitor>*'

serviceMonitor:
enabled: true
labels:
prometheus: default
interval: "30s"
interval: "1s"

ingress:
enabled: true
@@ -271,3 +289,8 @@ networkPolicy:
- key: test
operator: NotIn
values: [network-policy]

catalogs:
tpch: |
connector.name=tpch
tpch.splits-per-node=4