Skip to content

Commit

Permalink
fix: mark taskresult complete when failed or error. Fixes #12993, Fixes
Browse files Browse the repository at this point in the history
#13533 (#13798)

Signed-off-by: isubasinghe <[email protected]>
  • Loading branch information
isubasinghe authored Oct 28, 2024
1 parent f7e6ddf commit eb23eb6
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 11 deletions.
2 changes: 1 addition & 1 deletion docs/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ This document outlines environment variables that can be used to customize behav
| `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int` | `6` | The number of buckets to collect the metric for the operation duration. |
| `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. |
| `RECENTLY_STARTED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently started. |
| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently deleted. |
| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `2m` | The duration of a pod before the pod is considered to be recently deleted. |
| `RETRY_BACKOFF_DURATION` | `time.Duration` | `10ms` | The retry back-off duration when retrying API calls. |
| `RETRY_BACKOFF_FACTOR` | `float` | `2.0` | The retry back-off factor when retrying API calls. |
| `RETRY_BACKOFF_STEPS` | `int` | `5` | The retry back-off steps when retrying API calls. |
Expand Down
5 changes: 0 additions & 5 deletions pkg/apis/workflow/v1alpha1/workflow_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2409,11 +2409,6 @@ func (n NodeStatus) IsExitNode() bool {
return strings.HasSuffix(n.DisplayName, ".onExit")
}

// IsPodDeleted returns whether node is error with pod deleted.
func (n NodeStatus) IsPodDeleted() bool {
return n.Phase == NodeError && n.Message == "pod deleted"
}

func (n NodeStatus) Succeeded() bool {
return n.Phase == NodeSucceeded
}
Expand Down
10 changes: 5 additions & 5 deletions workflow/controller/taskresult.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex
}

func recentlyDeleted(node *wfv1.NodeStatus) bool {
return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 10*time.Second)
return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
}

func (woc *wfOperationCtx) taskResultReconciliation() {
Expand Down Expand Up @@ -84,19 +84,19 @@ func (woc *wfOperationCtx) taskResultReconciliation() {
if err != nil {
continue
}

// Mark task result as completed if it has no chance to be completed.
if label == "false" && old.IsPodDeleted() {
if label == "false" && old.Completed() && !woc.nodePodExist(*old) {
if recentlyDeleted(old) {
woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.")
// If the pod was deleted, then it is possible that the controller never get another informer message about it.
// In this case, the workflow will only be requeued after the resync period (20m). This means
// workflow will not update for 20m. Requeuing here prevents that happening.
woc.requeue()
continue
} else {
woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
woc.wf.Status.MarkTaskResultComplete(nodeID)
}
woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
woc.wf.Status.MarkTaskResultComplete(nodeID)
}
newNode := old.DeepCopy()
if result.Outputs.HasOutputs() {
Expand Down

0 comments on commit eb23eb6

Please sign in to comment.