From cb2397a41de96b1127fdcbb87556355391092d6b Mon Sep 17 00:00:00 2001 From: jswxstw Date: Wed, 26 Jun 2024 20:22:14 +0800 Subject: [PATCH] fix: Skip execution control for nodes of exit handler. Fixes #13060 Signed-off-by: oninowang --- workflow/controller/exec_control.go | 2 +- workflow/controller/exit_handler_test.go | 2 -- workflow/controller/operator.go | 20 ++++++++++++-------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/workflow/controller/exec_control.go b/workflow/controller/exec_control.go index a58ac03767768..b341fa343e2e7 100644 --- a/workflow/controller/exec_control.go +++ b/workflow/controller/exec_control.go @@ -96,7 +96,7 @@ func (woc *wfOperationCtx) handleExecutionControlError(nodeID string, wfNodesLoc // if node is a pod created from ContainerSet template // then need to fail child nodes so they will not hang in Pending after pod deletion for _, child := range children { - if !child.IsExitNode() && !child.Fulfilled() { + if !child.Fulfilled() { woc.markNodePhase(child.Name, wfv1.NodeFailed, errorMsg) } } diff --git a/workflow/controller/exit_handler_test.go b/workflow/controller/exit_handler_test.go index 4f6b989b1a350..9749643870c66 100644 --- a/workflow/controller/exit_handler_test.go +++ b/workflow/controller/exit_handler_test.go @@ -828,8 +828,6 @@ status: type: Steps hello-world-647r7-1045616760: boundaryID: hello-world-647r7-206029318 - children: - - hello-world-647r7-370991976 displayName: '[0]' finishedAt: null id: hello-world-647r7-1045616760 diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index ae404bedbbf4c..295d37d8cd17c 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1272,20 +1272,24 @@ func (woc *wfOperationCtx) shouldPrintPodSpec(node *wfv1.NodeStatus) bool { // failNodesWithoutCreatedPodsAfterDeadlineOrShutdown mark the nodes without created pods failed when shutting down or exceeding deadline. func (woc *wfOperationCtx) failNodesWithoutCreatedPodsAfterDeadlineOrShutdown() { - for _, node := range woc.wf.Status.Nodes { + nodes := woc.wf.Status.Nodes + for _, node := range nodes { if node.Fulfilled() { continue } - // fail suspended nodes or taskset nodes when shutting down - if woc.GetShutdownStrategy().Enabled() && (node.IsActiveSuspendNode() || node.IsTaskSetNode()) { - message := fmt.Sprintf("Stopped with strategy '%s'", woc.GetShutdownStrategy()) - woc.markNodePhase(node.Name, wfv1.NodeFailed, message) - continue + // Only fail nodes that are not part of exit handler if we are "Stopping" or all pods if we are "Terminating" + if woc.GetShutdownStrategy().Enabled() && !woc.GetShutdownStrategy().ShouldExecute(node.IsPartOfExitHandler(nodes)) { + // fail suspended nodes or taskset nodes when shutting down + if node.IsActiveSuspendNode() || node.IsTaskSetNode() { + message := fmt.Sprintf("Stopped with strategy '%s'", woc.GetShutdownStrategy()) + woc.markNodePhase(node.Name, wfv1.NodeFailed, message) + continue + } } - // fail all pending and suspended nodes when exceeding deadline + // fail pending and suspended nodes that are not part of exit handler when exceeding deadline deadlineExceeded := woc.workflowDeadline != nil && time.Now().UTC().After(*woc.workflowDeadline) - if deadlineExceeded && (node.Phase == wfv1.NodePending || node.IsActiveSuspendNode()) { + if deadlineExceeded && !node.IsPartOfExitHandler(nodes) && (node.Phase == wfv1.NodePending || node.IsActiveSuspendNode()) { message := "Step exceeded its deadline" woc.markNodePhase(node.Name, wfv1.NodeFailed, message) continue