This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Retry map task subtasks (#236)
* handling phase transitions and retry attempts to retry only failed subtasks

Signed-off-by: Daniel Rammer <[email protected]>

* fixed tests and linter

Signed-off-by: Daniel Rammer <[email protected]>

* added subtask retry attempt to log link id

Signed-off-by: Daniel Rammer <[email protected]>

* fixed allowing 1 more retry than the maximum number of attempts

Signed-off-by: Daniel Rammer <[email protected]>

* fixed lint issues

Signed-off-by: Daniel Rammer <[email protected]>

* updating podName generation to ensure backwards compatibility.

Signed-off-by: Daniel Rammer <[email protected]>

* fixed lint

Signed-off-by: Daniel Rammer <[email protected]>

* using the existing retryAttempt number when transitioning running tasks to the subtask retry attempts array

Signed-off-by: Daniel Rammer <[email protected]>

* added unit tests

Signed-off-by: Daniel Rammer <[email protected]>
hamersaw authored Feb 3, 2022
1 parent 6b72fd6 commit 14ed6a8
Showing 9 changed files with 240 additions and 46 deletions.
35 changes: 17 additions & 18 deletions go/tasks/plugins/array/core/state.go
@@ -256,24 +256,23 @@ func MapArrayStateToPluginPhase(_ context.Context, state *State, logLinks []*idl
func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus.ArraySummary) Phase {
totalCount := int64(0)
totalSuccesses := int64(0)
totalFailures := int64(0)
totalPermanentFailures := int64(0)
totalRetryableFailures := int64(0)
totalRunning := int64(0)
totalWaitingForResources := int64(0)
for phase, count := range summary {
totalCount += count
if phase.IsTerminal() {
if phase.IsSuccess() {
totalSuccesses += count
} else {
// TODO: Split out retryable failures to be retried without doing the entire array task.
// TODO: Other option: array tasks are only retryable as a full set and to get single task retriability
// TODO: dynamic_task must be updated to not auto-combine to array tasks. For scale reasons, it is
// TODO: preferable to auto-combine to array tasks for now.
totalFailures += count
}
} else if phase.IsWaitingForResources() {

switch phase {
case core.PhaseSuccess:
totalSuccesses += count
case core.PhasePermanentFailure:
totalPermanentFailures += count
case core.PhaseRetryableFailure:
totalRetryableFailures += count
case core.PhaseWaitingForResources:
totalWaitingForResources += count
} else {
default:
totalRunning += count
}
}
@@ -284,9 +283,9 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus
}

// No chance to reach the required success numbers.
if totalRunning+totalSuccesses+totalWaitingForResources < minSuccesses {
logger.Infof(ctx, "Array failed early because total failures > minSuccesses[%v]. Snapshot totalRunning[%v] + totalSuccesses[%v] + totalWaitingForResource[%v]",
minSuccesses, totalRunning, totalSuccesses, totalWaitingForResources)
if totalRunning+totalSuccesses+totalWaitingForResources+totalRetryableFailures < minSuccesses {
logger.Infof(ctx, "Array failed early because total failures > minSuccesses[%v]. Snapshot totalRunning[%v] + totalSuccesses[%v] + totalWaitingForResource[%v] + totalRetryableFailures[%v]",
minSuccesses, totalRunning, totalSuccesses, totalWaitingForResources, totalRetryableFailures)
return PhaseWriteToDiscoveryThenFail
}

@@ -299,8 +298,8 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus
return PhaseWriteToDiscovery
}

logger.Debugf(ctx, "Array is still running [Successes: %v, Failures: %v, Total: %v, MinSuccesses: %v]",
totalSuccesses, totalFailures, totalCount, minSuccesses)
logger.Debugf(ctx, "Array is still running [Successes: %v, PermanentFailures: %v, RetryableFailures: %v, Total: %v, MinSuccesses: %v]",
totalSuccesses, totalPermanentFailures, totalRetryableFailures, totalCount, minSuccesses)
return PhaseCheckingSubTaskExecutions
}

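Note on the `SummaryToPhase` change above: retryable failures are no longer lumped in with permanent failures. When deciding whether `minSuccesses` is still reachable, a subtask in `PhaseRetryableFailure` counts as one that can still succeed; only permanent failures are excluded. A minimal, standalone sketch of that reachability rule (illustrative helper name and values, not the plugin's actual code):

```go
// Illustrative, standalone sketch of the reachability rule after this change:
// retryable failures are still "in play" and count toward the pool of subtasks
// that could eventually reach minSuccesses. This is not the plugin's code; the
// helper name and the numbers in main() are made up for the example.
package main

import "fmt"

func canStillSucceed(minSuccesses, running, successes, waiting, retryableFailures int64) bool {
	// Only permanent failures are excluded from the pool of possible successes.
	return running+successes+waiting+retryableFailures >= minSuccesses
}

func main() {
	// 10 subtasks, minSuccesses = 10:
	// 1 permanent failure + 9 still running  -> 9 < 10, the array fails early.
	fmt.Println(canStillSucceed(10, 9, 0, 0, 0)) // false
	// 1 retryable failure + 9 still running  -> 10 >= 10, keep checking subtasks.
	fmt.Println(canStillSucceed(10, 9, 0, 0, 1)) // true
}
```

This matches the new `TestSummaryToPhase` cases below, where one permanent failure fails the array early but one retryable failure keeps it in `PhaseCheckingSubTaskExecutions`.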
60 changes: 60 additions & 0 deletions go/tasks/plugins/array/core/state_test.go
@@ -315,3 +315,63 @@ func TestToArrayJob(t *testing.T) {
}))
})
}

func TestSummaryToPhase(t *testing.T) {
minSuccesses := int64(10)
tests := []struct {
name string
phase Phase
summary map[core.Phase]int64
}{
{
"FailOnTooFewTasks",
PhaseWriteToDiscoveryThenFail,
map[core.Phase]int64{},
},
{
"ContinueOnRetryableFailures",
PhaseCheckingSubTaskExecutions,
map[core.Phase]int64{
core.PhaseRetryableFailure: 1,
core.PhaseUndefined: 9,
},
},
{
"FailOnToManyPermanentFailures",
PhaseWriteToDiscoveryThenFail,
map[core.Phase]int64{
core.PhasePermanentFailure: 1,
core.PhaseUndefined: 9,
},
},
{
"CheckWaitingForResources",
PhaseWaitingForResources,
map[core.Phase]int64{
core.PhaseWaitingForResources: 1,
core.PhaseUndefined: 9,
},
},
{
"WaitForAllSubtasksToComplete",
PhaseCheckingSubTaskExecutions,
map[core.Phase]int64{
core.PhaseUndefined: 1,
core.PhaseSuccess: 9,
},
},
{
"SuccessfullyCompleted",
PhaseWriteToDiscovery,
map[core.Phase]int64{
core.PhaseSuccess: 10,
},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.phase, SummaryToPhase(context.TODO(), minSuccesses, tt.summary))
})
}
}
2 changes: 1 addition & 1 deletion go/tasks/plugins/array/k8s/executor.go
@@ -121,7 +121,7 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c
nextState, err = array.WriteToDiscovery(ctx, tCtx, pluginState, arrayCore.PhaseAssembleFinalOutput)

case arrayCore.PhaseAssembleFinalError:
nextState, err = array.AssembleFinalOutputs(ctx, e.errorAssembler, tCtx, arrayCore.PhaseRetryableFailure, pluginState)
nextState, err = array.AssembleFinalOutputs(ctx, e.errorAssembler, tCtx, arrayCore.PhasePermanentFailure, pluginState)

default:
nextState = pluginState
14 changes: 12 additions & 2 deletions go/tasks/plugins/array/k8s/launcher.go
@@ -3,6 +3,7 @@ package k8s
import (
"context"
"fmt"
"strconv"

"github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils"

@@ -33,8 +34,17 @@ var arrayJobEnvVars = []corev1.EnvVar{
},
}

func formatSubTaskName(_ context.Context, parentName, suffix string) (subTaskName string) {
return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v", parentName, suffix))
func formatSubTaskName(_ context.Context, parentName string, index int, retryAttempt uint64) (subTaskName string) {
indexStr := strconv.Itoa(index)

// If the retryAttempt is 0 we do not include it in the pod name. This gives us backwards
// compatibility in the ability to dynamically transition running map tasks to use subtask retries.
if retryAttempt == 0 {
return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v", parentName, indexStr))
}

retryAttemptStr := strconv.FormatUint(retryAttempt, 10)
return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v-%v", parentName, indexStr, retryAttemptStr))
}

func ApplyPodPolicies(_ context.Context, cfg *Config, pod *corev1.Pod) *corev1.Pod {
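Note on the `formatSubTaskName` change above: a retry attempt of 0 is omitted from the generated pod name, so subtasks that have never been retried keep the old `<parentName>-<index>` form, which is what makes the transition backwards compatible for already-running map tasks. A standalone sketch of the naming rule (the real helper additionally passes the result through `utils.ConvertToDNS1123SubdomainCompatibleString`):

```go
// Standalone sketch of the new pod-naming rule: attempt 0 keeps the old
// "<parent>-<index>" form for backwards compatibility, later attempts append
// "-<retryAttempt>". Not the plugin's actual helper, which additionally runs
// the result through utils.ConvertToDNS1123SubdomainCompatibleString.
package main

import "fmt"

func subTaskPodName(parentName string, index int, retryAttempt uint64) string {
	if retryAttempt == 0 {
		return fmt.Sprintf("%s-%d", parentName, index)
	}
	return fmt.Sprintf("%s-%d-%d", parentName, index, retryAttempt)
}

func main() {
	fmt.Println(subTaskPodName("foo", 2, 0)) // foo-2   (unchanged from before this commit)
	fmt.Println(subTaskPodName("foo", 2, 1)) // foo-2-1 (first retry of subtask 2)
}
```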
23 changes: 23 additions & 0 deletions go/tasks/plugins/array/k8s/launcher_test.go
@@ -2,6 +2,7 @@ package k8s

import (
"context"
"fmt"
"testing"

"github.com/stretchr/testify/assert"
@@ -39,3 +40,25 @@ func TestApplyPodTolerations(t *testing.T) {

assert.Equal(t, pod.Spec.Tolerations, cfg.Tolerations)
}

func TestFormatSubTaskName(t *testing.T) {
ctx := context.Background()
parentName := "foo"

tests := []struct {
index int
retryAttempt uint64
want string
}{
{0, 0, fmt.Sprintf("%v-%v", parentName, 0)},
{1, 0, fmt.Sprintf("%v-%v", parentName, 1)},
{0, 1, fmt.Sprintf("%v-%v-%v", parentName, 0, 1)},
{1, 1, fmt.Sprintf("%v-%v-%v", parentName, 1, 1)},
}

for i, tt := range tests {
t.Run(fmt.Sprintf("format-subtask-name-%v", i), func(t *testing.T) {
assert.Equal(t, tt.want, formatSubTaskName(ctx, parentName, tt.index, tt.retryAttempt))
})
}
}
44 changes: 34 additions & 10 deletions go/tasks/plugins/array/k8s/monitor.go
@@ -3,7 +3,6 @@ package k8s
import (
"context"
"fmt"
"strconv"
"time"

"github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog"
@@ -73,10 +72,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon
return currentState, logLinks, subTaskIDs, nil
}

// Currently if any subtask fails then all subtasks are retried up to MaxAttempts. Therefore, all
// subtasks have an identical RetryAttempt, namely that of the map task execution metadata. Once
// retries over individual subtasks are implemented we should revisit this logic and instead
// increment the RetryAttempt for each subtask everytime a new pod is created.
// Set subtask retryAttempts using the existing task context retry attempt. For new tasks
// this will initialize to 0, but running tasks will use the existing retry attempt.
retryAttempt := bitarray.Item(tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID().RetryAttempt)
for i := 0; i < currentState.GetExecutionArraySize(); i++ {
retryAttemptsArray.SetItem(i, retryAttempt)
@@ -93,20 +90,38 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon

for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() {
existingPhase := core.Phases[existingPhaseIdx]
indexStr := strconv.Itoa(childIdx)
podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr)
originalIdx := arrayCore.CalculateOriginalIndex(childIdx, newState.GetIndexesToCache())

retryAttempt := currentState.RetryAttempts.GetItem(childIdx)
podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), childIdx, retryAttempt)

if existingPhase.IsTerminal() {
// If we get here it means we have already "processed" this terminal phase since we will only persist
// the phase after all processing is done (e.g. check outputs/errors file, record events... etc.).

// Since we know we have already "processed" this terminal phase we can safely deallocate resource
err = deallocateResource(ctx, tCtx, config, childIdx)
err = deallocateResource(ctx, tCtx, config, podName)
if err != nil {
logger.Errorf(ctx, "Error releasing allocation token [%s] in LaunchAndCheckSubTasks [%s]", podName, err)
return currentState, logLinks, subTaskIDs, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.")
}

// If a subtask is marked as a retryable failure we check if the number of retries
// exceeds the maximum attempts. If so, transition the task to a permanent failure
// so that it is not attempted again. If it can be retried, increment the retry attempts
// value and transition the task to "Undefined" so that it is reevaluated.
if existingPhase == core.PhaseRetryableFailure {
if uint32(retryAttempt+1) < tCtx.TaskExecutionMetadata().GetMaxAttempts() {
newState.RetryAttempts.SetItem(childIdx, retryAttempt+1)

newArrayStatus.Summary.Inc(core.PhaseUndefined)
newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseUndefined))
continue
} else {
existingPhase = core.PhasePermanentFailure
}
}

newArrayStatus.Summary.Inc(existingPhase)
newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(existingPhase))

@@ -117,6 +132,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon
},
originalIdx,
tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID().RetryAttempt,
retryAttempt,
logPlugin)

if err != nil {
@@ -209,7 +225,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon
return newState, logLinks, subTaskIDs, nil
}

func FetchPodStatusAndLogs(ctx context.Context, client core.KubeClient, name k8sTypes.NamespacedName, index int, retryAttempt uint32, logPlugin tasklog.Plugin) (
func FetchPodStatusAndLogs(ctx context.Context, client core.KubeClient, name k8sTypes.NamespacedName, index int, retryAttempt uint32, subtaskRetryAttempt uint64, logPlugin tasklog.Plugin) (
info core.PhaseInfo, err error) {

pod := &v1.Pod{
@@ -244,12 +260,20 @@ func FetchPodStatusAndLogs(ctx context.Context, client core.KubeClient, name k8s
}

if pod.Status.Phase != v1.PodPending && pod.Status.Phase != v1.PodUnknown {
// We append the subtaskRetryAttempt to the log name only when it is > 0 to ensure backwards
// compatibility when dynamically transitioning running map tasks to use subtask retry attempts.
var logName string
if subtaskRetryAttempt == 0 {
logName = fmt.Sprintf(" #%d-%d", retryAttempt, index)
} else {
logName = fmt.Sprintf(" #%d-%d-%d", retryAttempt, index, subtaskRetryAttempt)
}

if logPlugin != nil {
o, err := logPlugin.GetTaskLogs(tasklog.Input{
PodName: pod.Name,
Namespace: pod.Namespace,
LogName: fmt.Sprintf(" #%d-%d", retryAttempt, index),
LogName: logName,
PodUnixStartTime: pod.CreationTimestamp.Unix(),
})

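Note on the monitor changes above: when a subtask reports `core.PhaseRetryableFailure`, the handler either bumps that subtask's entry in the `RetryAttempts` array and resets its phase to `PhaseUndefined` so only that subtask is relaunched, or, once `retryAttempt+1` reaches `GetMaxAttempts()`, promotes it to `core.PhasePermanentFailure`. A simplified sketch of that decision, using hypothetical plain types instead of the plugin's bitarray-backed state:

```go
// Simplified sketch of the retry decision for a single subtask. The types and
// function here are hypothetical stand-ins for the plugin's bitarray-backed
// ArrayStatus/RetryAttempts state; only the branching mirrors the monitor code above.
package main

import "fmt"

type phase string

const (
	phaseUndefined        phase = "Undefined"
	phaseRetryableFailure phase = "RetryableFailure"
	phasePermanentFailure phase = "PermanentFailure"
)

// nextSubtaskPhase takes the phase just observed for a subtask, the retry
// attempt that produced it, and the task's configured maximum attempts.
func nextSubtaskPhase(current phase, retryAttempt, maxAttempts uint32) (phase, uint32) {
	if current != phaseRetryableFailure {
		return current, retryAttempt
	}
	if retryAttempt+1 < maxAttempts {
		// Another attempt is allowed: bump this subtask's retry attempt and
		// reset it to Undefined so only this subtask is relaunched.
		return phaseUndefined, retryAttempt + 1
	}
	// Attempts exhausted: promote to a permanent failure so it is not retried.
	return phasePermanentFailure, retryAttempt
}

func main() {
	p, attempt := nextSubtaskPhase(phaseRetryableFailure, 0, 2)
	fmt.Println(p, attempt) // Undefined 1
	p, attempt = nextSubtaskPhase(phaseRetryableFailure, 1, 2)
	fmt.Println(p, attempt) // PermanentFailure 1
}
```

The same backwards-compatibility convention used for pod names applies to log link names in `FetchPodStatusAndLogs`: a subtask retry attempt of 0 keeps the old ` #<retryAttempt>-<index>` label, and later attempts append `-<subtaskRetryAttempt>`.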
71 changes: 71 additions & 0 deletions go/tasks/plugins/array/k8s/monitor_test.go
@@ -216,6 +216,77 @@ func TestCheckSubTasksState(t *testing.T) {
resourceManager.AssertNumberOfCalls(t, "AllocateResource", 5)
assert.Empty(t, subTaskIDs, "subtask ids are only populated when monitor is called for a successfully launched task")
})

t.Run("RetryableSubtaskFailure", func(t *testing.T) {
failureIndex := 2

config := Config{
MaxArrayJobSize: 100,
MaxErrorStringLength: 200,
}

detailed := arrayCore.NewPhasesCompactArray(uint(5))
detailed.SetItem(failureIndex, bitarray.Item(core.PhaseRetryableFailure))

retryAttemptsArray, err := bitarray.NewCompactArray(5, bitarray.Item(1))
assert.NoError(t, err)

cacheIndexes := bitarray.NewBitSet(5)
newState, _, _, err := LaunchAndCheckSubTasksState(ctx, tCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", &arrayCore.State{
CurrentPhase: arrayCore.PhaseCheckingSubTaskExecutions,
ExecutionArraySize: 5,
OriginalArraySize: 10,
OriginalMinSuccesses: 5,
IndexesToCache: cacheIndexes,
ArrayStatus: arraystatus.ArrayStatus{
Detailed: detailed,
},
RetryAttempts: retryAttemptsArray,
})

assert.Nil(t, err)

p, _ := newState.GetPhase()
assert.Equal(t, arrayCore.PhaseCheckingSubTaskExecutions.String(), p.String())
assert.Equal(t, core.PhaseUndefined, core.Phases[newState.ArrayStatus.Detailed.GetItem(failureIndex)])
assert.Equal(t, uint64(1), newState.RetryAttempts.GetItem(failureIndex))
})

t.Run("PermanentSubtaskFailure", func(t *testing.T) {
failureIndex := 2

config := Config{
MaxArrayJobSize: 100,
MaxErrorStringLength: 200,
}

detailed := arrayCore.NewPhasesCompactArray(uint(5))
detailed.SetItem(failureIndex, bitarray.Item(core.PhaseRetryableFailure))

retryAttemptsArray, err := bitarray.NewCompactArray(5, bitarray.Item(1))
assert.NoError(t, err)
retryAttemptsArray.SetItem(failureIndex, bitarray.Item(1))

cacheIndexes := bitarray.NewBitSet(5)
newState, _, _, err := LaunchAndCheckSubTasksState(ctx, tCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", &arrayCore.State{
CurrentPhase: arrayCore.PhaseCheckingSubTaskExecutions,
ExecutionArraySize: 5,
OriginalArraySize: 10,
OriginalMinSuccesses: 5,
IndexesToCache: cacheIndexes,
ArrayStatus: arraystatus.ArrayStatus{
Detailed: detailed,
},
RetryAttempts: retryAttemptsArray,
})

assert.Nil(t, err)

p, _ := newState.GetPhase()
assert.Equal(t, arrayCore.PhaseCheckingSubTaskExecutions.String(), p.String())
assert.Equal(t, core.PhasePermanentFailure, core.Phases[newState.ArrayStatus.Detailed.GetItem(failureIndex)])
assert.Equal(t, uint64(1), newState.RetryAttempts.GetItem(failureIndex))
})
}

func TestCheckSubTasksStateResourceGranted(t *testing.T) {