From 9d7bd8645eae4c7ede2e59ecfeaafe387fb63d87 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Mon, 14 Feb 2022 08:43:15 -0600 Subject: [PATCH 01/20] refactored to use pod plugin in map task subtasks Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/executor.go | 12 +- go/tasks/plugins/array/k8s/launcher.go | 101 ----- go/tasks/plugins/array/k8s/launcher_test.go | 64 ---- go/tasks/plugins/array/k8s/management.go | 276 ++++++++++++++ .../{monitor_test.go => management_test.go} | 5 +- go/tasks/plugins/array/k8s/monitor.go | 348 ------------------ go/tasks/plugins/array/k8s/subtask.go | 180 +++++++++ .../plugins/array/k8s/subtask_exec_context.go | 150 ++++++++ go/tasks/plugins/array/k8s/task.go | 331 ----------------- go/tasks/plugins/array/k8s/task_test.go | 136 ------- go/tasks/plugins/array/k8s/transformer.go | 182 --------- .../plugins/array/k8s/transformer_test.go | 257 ------------- go/tasks/plugins/k8s/pod/container.go | 2 +- go/tasks/plugins/k8s/pod/plugin.go | 41 +-- go/tasks/plugins/k8s/pod/sidecar.go | 10 +- 15 files changed, 640 insertions(+), 1455 deletions(-) delete mode 100644 go/tasks/plugins/array/k8s/launcher.go delete mode 100644 go/tasks/plugins/array/k8s/launcher_test.go create mode 100644 go/tasks/plugins/array/k8s/management.go rename go/tasks/plugins/array/k8s/{monitor_test.go => management_test.go} (99%) delete mode 100644 go/tasks/plugins/array/k8s/monitor.go create mode 100644 go/tasks/plugins/array/k8s/subtask.go create mode 100644 go/tasks/plugins/array/k8s/subtask_exec_context.go delete mode 100644 go/tasks/plugins/array/k8s/task.go delete mode 100644 go/tasks/plugins/array/k8s/task_test.go delete mode 100644 go/tasks/plugins/array/k8s/transformer.go delete mode 100644 go/tasks/plugins/array/k8s/transformer_test.go diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index ceb760ca1..ecb2aeb44 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -3,20 +3,20 @@ package k8s import ( "context" - "sigs.k8s.io/controller-runtime/pkg/cache" - "sigs.k8s.io/controller-runtime/pkg/client" - idlCore "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" + "github.com/flyteorg/flyteplugins/go/tasks/errors" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" "github.com/flyteorg/flyteplugins/go/tasks/plugins/array" arrayCore "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/core" + "github.com/flyteorg/flytestdlib/logger" "github.com/flyteorg/flytestdlib/promutils" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" - "github.com/flyteorg/flyteplugins/go/tasks/errors" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" ) const executorName = "k8s-array" diff --git a/go/tasks/plugins/array/k8s/launcher.go b/go/tasks/plugins/array/k8s/launcher.go deleted file mode 100644 index 3b2d9510a..000000000 --- a/go/tasks/plugins/array/k8s/launcher.go +++ /dev/null @@ -1,101 +0,0 @@ -package k8s - -import ( - "context" - "fmt" - "strconv" - - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils" - - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/errorcollector" - - arrayCore "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/core" - - errors2 "github.com/flyteorg/flytestdlib/errors" - - corev1 "k8s.io/api/core/v1" - - 
"github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" -) - -const ( - ErrBuildPodTemplate errors2.ErrorCode = "POD_TEMPLATE_FAILED" - ErrReplaceCmdTemplate errors2.ErrorCode = "CMD_TEMPLATE_FAILED" - ErrSubmitJob errors2.ErrorCode = "SUBMIT_JOB_FAILED" - ErrGetTaskTypeVersion errors2.ErrorCode = "GET_TASK_TYPE_VERSION_FAILED" - JobIndexVarName string = "BATCH_JOB_ARRAY_INDEX_VAR_NAME" - FlyteK8sArrayIndexVarName string = "FLYTE_K8S_ARRAY_INDEX" -) - -var arrayJobEnvVars = []corev1.EnvVar{ - { - Name: JobIndexVarName, - Value: FlyteK8sArrayIndexVarName, - }, -} - -func formatSubTaskName(_ context.Context, parentName string, index int, retryAttempt uint64) (subTaskName string) { - indexStr := strconv.Itoa(index) - - // If the retryAttempt is 0 we do not include it in the pod name. The gives us backwards - // compatibility in the ability to dynamically transition running map tasks to use subtask retries. - if retryAttempt == 0 { - return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v", parentName, indexStr)) - } - - retryAttemptStr := strconv.FormatUint(retryAttempt, 10) - return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v-%v", parentName, indexStr, retryAttemptStr)) -} - -func ApplyPodPolicies(_ context.Context, cfg *Config, pod *corev1.Pod) *corev1.Pod { - if len(cfg.DefaultScheduler) > 0 { - pod.Spec.SchedulerName = cfg.DefaultScheduler - } - - return pod -} - -func applyNodeSelectorLabels(_ context.Context, cfg *Config, pod *corev1.Pod) *corev1.Pod { - if len(cfg.NodeSelector) != 0 { - pod.Spec.NodeSelector = cfg.NodeSelector - } - - return pod -} - -func applyPodTolerations(_ context.Context, cfg *Config, pod *corev1.Pod) *corev1.Pod { - if len(cfg.Tolerations) != 0 { - pod.Spec.Tolerations = cfg.Tolerations - } - - return pod -} - -func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, - currentState *arrayCore.State) error { - - size := currentState.GetExecutionArraySize() - errs := errorcollector.NewErrorMessageCollector() - for childIdx := 0; childIdx < size; childIdx++ { - task := Task{ - ChildIdx: childIdx, - Config: config, - State: currentState, - } - - err := task.Abort(ctx, tCtx, kubeClient) - if err != nil { - errs.Collect(childIdx, err.Error()) - } - err = task.Finalize(ctx, tCtx, kubeClient) - if err != nil { - errs.Collect(childIdx, err.Error()) - } - } - - if errs.Length() > 0 { - return fmt.Errorf(errs.Summary(config.MaxErrorStringLength)) - } - - return nil -} diff --git a/go/tasks/plugins/array/k8s/launcher_test.go b/go/tasks/plugins/array/k8s/launcher_test.go deleted file mode 100644 index 729f4c0ea..000000000 --- a/go/tasks/plugins/array/k8s/launcher_test.go +++ /dev/null @@ -1,64 +0,0 @@ -package k8s - -import ( - "context" - "fmt" - "testing" - - "github.com/stretchr/testify/assert" - corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" -) - -func TestApplyNodeSelectorLabels(t *testing.T) { - ctx := context.Background() - cfg := &Config{ - NodeSelector: map[string]string{ - "disktype": "ssd", - }, - } - pod := &corev1.Pod{} - - pod = applyNodeSelectorLabels(ctx, cfg, pod) - - assert.Equal(t, pod.Spec.NodeSelector, cfg.NodeSelector) -} - -func TestApplyPodTolerations(t *testing.T) { - ctx := context.Background() - cfg := &Config{ - Tolerations: []v1.Toleration{{ - Key: "reserved", - Operator: "equal", - Value: "value", - Effect: "NoSchedule", - }}, - } - pod := &corev1.Pod{} - - pod = applyPodTolerations(ctx, cfg, pod) - - assert.Equal(t, 
pod.Spec.Tolerations, cfg.Tolerations) -} - -func TestFormatSubTaskName(t *testing.T) { - ctx := context.Background() - parentName := "foo" - - tests := []struct { - index int - retryAttempt uint64 - want string - }{ - {0, 0, fmt.Sprintf("%v-%v", parentName, 0)}, - {1, 0, fmt.Sprintf("%v-%v", parentName, 1)}, - {0, 1, fmt.Sprintf("%v-%v-%v", parentName, 0, 1)}, - {1, 1, fmt.Sprintf("%v-%v-%v", parentName, 1, 1)}, - } - - for i, tt := range tests { - t.Run(fmt.Sprintf("format-subtask-name-%v", i), func(t *testing.T) { - assert.Equal(t, tt.want, formatSubTaskName(ctx, parentName, tt.index, tt.retryAttempt)) - }) - } -} diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go new file mode 100644 index 000000000..b43d66a1f --- /dev/null +++ b/go/tasks/plugins/array/k8s/management.go @@ -0,0 +1,276 @@ +package k8s + +import ( + "context" + "fmt" + "strings" + + idlCore "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" + + "github.com/flyteorg/flyteplugins/go/tasks/errors" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" + "github.com/flyteorg/flyteplugins/go/tasks/plugins/array" + "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/arraystatus" + arrayCore "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/core" + "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/errorcollector" + + "github.com/flyteorg/flytestdlib/bitarray" + "github.com/flyteorg/flytestdlib/logger" + "github.com/flyteorg/flytestdlib/storage" + + k8serrors "k8s.io/apimachinery/pkg/api/errors" +) + +func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) (core.AllocationStatus, error) { + if !IsResourceConfigSet(config.ResourceConfig) { + return core.AllocationStatusGranted, nil + } + + resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) + resourceConstraintSpec := core.ResourceConstraintsSpec{ + ProjectScopeResourceConstraint: nil, + NamespaceScopeResourceConstraint: nil, + } + + allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstraintSpec) + if err != nil { + return core.AllocationUndefined, err + } + + return allocationStatus, nil +} + +func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) error { + if !IsResourceConfigSet(config.ResourceConfig) { + return nil + } + resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) + + err := tCtx.ResourceManager().ReleaseResource(ctx, resourceNamespace, podName) + if err != nil { + logger.Errorf(ctx, "Error releasing token [%s]. error %s", podName, err) + return err + } + + return nil +} + +func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, + config *Config, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference, currentState *arrayCore.State) ( + newState *arrayCore.State, logLinks []*idlCore.TaskLog, subTaskIDs []*string, err error) { + if int64(currentState.GetExecutionArraySize()) > config.MaxArrayJobSize { + ee := fmt.Errorf("array size > max allowed. Requested [%v]. 
Allowed [%v]", currentState.GetExecutionArraySize(), config.MaxArrayJobSize) + logger.Info(ctx, ee) + currentState = currentState.SetPhase(arrayCore.PhasePermanentFailure, 0).SetReason(ee.Error()) + return currentState, logLinks, subTaskIDs, nil + } + + logLinks = make([]*idlCore.TaskLog, 0, 4) + newState = currentState + messageCollector := errorcollector.NewErrorMessageCollector() + newArrayStatus := &arraystatus.ArrayStatus{ + Summary: arraystatus.ArraySummary{}, + Detailed: arrayCore.NewPhasesCompactArray(uint(currentState.GetExecutionArraySize())), + } + subTaskIDs = make([]*string, 0, len(currentState.GetArrayStatus().Detailed.GetItems())) + + // If we have arrived at this state for the first time then currentState has not been + // initialized with number of sub tasks. + if len(currentState.GetArrayStatus().Detailed.GetItems()) == 0 { + currentState.ArrayStatus = *newArrayStatus + } + + // If the current State is newly minted then we must initialize RetryAttempts to track how many + // times each subtask is executed. + if len(currentState.RetryAttempts.GetItems()) == 0 { + count := uint(currentState.GetExecutionArraySize()) + maxValue := bitarray.Item(tCtx.TaskExecutionMetadata().GetMaxAttempts()) + + retryAttemptsArray, err := bitarray.NewCompactArray(count, maxValue) + if err != nil { + logger.Errorf(context.Background(), "Failed to create attempts compact array with [count: %v, maxValue: %v]", count, maxValue) + return currentState, logLinks, subTaskIDs, nil + } + + // Initialize subtask retryAttempts to 0 so that, in tandem with the podName logic, we + // maintain backwards compatability. + for i := 0; i < currentState.GetExecutionArraySize(); i++ { + retryAttemptsArray.SetItem(i, 0) + } + + currentState.RetryAttempts = retryAttemptsArray + } + + // identify max parallelism + taskTemplate, err := tCtx.TaskReader().Read(ctx) + if err != nil { + return currentState, logLinks, subTaskIDs, err + } else if taskTemplate == nil { + return currentState, logLinks, subTaskIDs, errors.Errorf(errors.BadTaskSpecification, "Required value not set, taskTemplate is nil") + } + + arrayJob, err := arrayCore.ToArrayJob(taskTemplate.GetCustom(), taskTemplate.TaskTypeVersion) + if err != nil { + return currentState, logLinks, subTaskIDs, err + } + + currentParallelism := 0 + maxParallelism := int(arrayJob.Parallelism) + + for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() { + existingPhase := core.Phases[existingPhaseIdx] + retryAttempt := currentState.RetryAttempts.GetItem(childIdx) + + if existingPhase == core.PhaseRetryableFailure { + retryAttempt++ + newState.RetryAttempts.SetItem(childIdx, retryAttempt) + } else if existingPhase.IsTerminal() { + continue + } + + originalIdx := arrayCore.CalculateOriginalIndex(childIdx, newState.GetIndexesToCache()) + stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, originalIdx, retryAttempt) + podName := stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName() + + if existingPhase == core.PhaseUndefined || existingPhase == core.PhaseWaitingForResources || existingPhase == core.PhaseRetryableFailure { + // attempt to allocateResource + allocationStatus, err := allocateResource(ctx, stCtx, config, podName) + if err != nil { + logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. 
error %s", + stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) + return currentState, logLinks, subTaskIDs, err + } + + logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) + if allocationStatus != core.AllocationStatusGranted { + newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseWaitingForResources)) + continue + } + + // create subtask + launchSubtask(ctx, stCtx, kubeClient) + if err != nil && !k8serrors.IsAlreadyExists(err) { + if k8serrors.IsForbidden(err) { + if strings.Contains(err.Error(), "exceeded quota") { + // TODO: Quota errors are retried forever, it would be good to have support for backoff strategy. + logger.Infof(ctx, "Failed to launch job, resource quota exceeded. Err: %v", err) + newState = newState.SetPhase(arrayCore.PhaseWaitingForResources, 0).SetReason("Not enough resources to launch job") + } else { + newState = newState.SetPhase(arrayCore.PhaseRetryableFailure, 0).SetReason("Failed to launch job.") + } + + newState.SetReason(err.Error()) + return newState, logLinks, subTaskIDs, nil + } + + return currentState, logLinks, subTaskIDs, err + } + } + + // monitor pod + phaseInfo, err := getSubtaskPhaseInfo(ctx, stCtx, kubeClient) + if err != nil { + return currentState, logLinks, subTaskIDs, err + } + + if phaseInfo.Err() != nil { + messageCollector.Collect(childIdx, phaseInfo.Err().String()) + } + + subTaskIDs = append(subTaskIDs, &podName) + if phaseInfo.Info() != nil { + logLinks = append(logLinks, phaseInfo.Info().Logs...) + } + + // process subtask phase + actualPhase := phaseInfo.Phase() + if actualPhase.IsSuccess() { + actualPhase, err = array.CheckTaskOutput(ctx, dataStore, outputPrefix, baseOutputDataSandbox, childIdx, originalIdx) + if err != nil { + return currentState, logLinks, subTaskIDs, err + } + } + + if actualPhase == core.PhaseRetryableFailure && uint32(retryAttempt+1) >= stCtx.TaskExecutionMetadata().GetMaxAttempts() { + // If we see a retryable failure we must check if the number of retries exceeds the maximum + // attempts. If so, transition to a permanent failure so that is not attempted again. 
+ newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhasePermanentFailure)) + } else { + newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(actualPhase)) + } + + if actualPhase.IsTerminal() { + err = deallocateResource(ctx, stCtx, config, podName) + if err != nil { + logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) + return currentState, logLinks, subTaskIDs, err + } + } + + // validate parallelism + if !actualPhase.IsTerminal() || actualPhase == core.PhaseRetryableFailure { + currentParallelism++ + } + + if maxParallelism != 0 && currentParallelism >= maxParallelism { + break + } + } + + // compute task phase from array status summary + for _, phaseIdx := range newArrayStatus.Detailed.GetItems() { + newArrayStatus.Summary.Inc(core.Phases[phaseIdx]) + } + + phase := arrayCore.SummaryToPhase(ctx, currentState.GetOriginalMinSuccesses()-currentState.GetOriginalArraySize()+int64(currentState.GetExecutionArraySize()), newArrayStatus.Summary) + + // process new state + newState = newState.SetArrayStatus(*newArrayStatus) + if phase == arrayCore.PhaseWriteToDiscoveryThenFail { + errorMsg := messageCollector.Summary(GetConfig().MaxErrorStringLength) + newState = newState.SetReason(errorMsg) + } + + if phase == arrayCore.PhaseCheckingSubTaskExecutions { + newPhaseVersion := uint32(0) + + // For now, the only changes to PhaseVersion and PreviousSummary occur for running array jobs. + for phase, count := range newState.GetArrayStatus().Summary { + newPhaseVersion += uint32(phase) * uint32(count) + } + + newState = newState.SetPhase(phase, newPhaseVersion).SetReason("Task is still running.") + } else { + newState = newState.SetPhase(phase, core.DefaultPhaseVersion) + } + + return newState, logLinks, subTaskIDs, nil +} + +func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, currentState *arrayCore.State) error { + // TODO - fix + /*size := currentState.GetExecutionArraySize() + messageCollector := errorcollector.NewErrorMessageCollector() + for childIdx := 0; childIdx < size; childIdx++ { + task := Task{ + ChildIdx: childIdx, + Config: config, + State: currentState, + } + + err := task.Abort(ctx, tCtx, kubeClient) + if err != nil { + messageCollector.Collect(childIdx, err.Error()) + } + err = task.Finalize(ctx, tCtx, kubeClient) + if err != nil { + messageCollector.Collect(childIdx, err.Error()) + } + } + + if errs.Length() > 0 { + return fmt.Errorf(errs.Summary(config.MaxErrorStringLength)) + }*/ + + return nil +} diff --git a/go/tasks/plugins/array/k8s/monitor_test.go b/go/tasks/plugins/array/k8s/management_test.go similarity index 99% rename from go/tasks/plugins/array/k8s/monitor_test.go rename to go/tasks/plugins/array/k8s/management_test.go index 282e0036f..3311be8d3 100644 --- a/go/tasks/plugins/array/k8s/monitor_test.go +++ b/go/tasks/plugins/array/k8s/management_test.go @@ -81,6 +81,7 @@ func getMockTaskExecutionContext(ctx context.Context, parallelism int) *mocks.Ta tMeta.OnGetOverrides().Return(overrides) tMeta.OnIsInterruptible().Return(false) tMeta.OnGetK8sServiceAccount().Return("s") + tMeta.OnGetSecurityContext().Return(core2.SecurityContext{}) tMeta.OnGetMaxAttempts().Return(2) tMeta.OnGetNamespace().Return("n") @@ -108,14 +109,14 @@ func getMockTaskExecutionContext(ctx context.Context, parallelism int) *mocks.Ta return tCtx } -func TestGetNamespaceForExecution(t *testing.T) { +/*func TestGetNamespaceForExecution(t *testing.T) { ctx := context.Background() tCtx := 
getMockTaskExecutionContext(ctx, 0) assert.Equal(t, GetNamespaceForExecution(tCtx, ""), tCtx.TaskExecutionMetadata().GetNamespace()) assert.Equal(t, GetNamespaceForExecution(tCtx, "abcd"), "abcd") assert.Equal(t, GetNamespaceForExecution(tCtx, "a-{{.namespace}}-b"), fmt.Sprintf("a-%s-b", tCtx.TaskExecutionMetadata().GetNamespace())) -} +}*/ func testSubTaskIDs(t *testing.T, actual []*string) { var expected = make([]*string, 5) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go deleted file mode 100644 index 55f42324e..000000000 --- a/go/tasks/plugins/array/k8s/monitor.go +++ /dev/null @@ -1,348 +0,0 @@ -package k8s - -import ( - "context" - "fmt" - "time" - - idlCore "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" - - "github.com/flyteorg/flyteplugins/go/tasks/errors" - "github.com/flyteorg/flyteplugins/go/tasks/logs" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog" - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/arraystatus" - arrayCore "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/core" - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/errorcollector" - - "github.com/flyteorg/flytestdlib/bitarray" - "github.com/flyteorg/flytestdlib/logger" - "github.com/flyteorg/flytestdlib/storage" - - v1 "k8s.io/api/core/v1" - k8serrors "k8s.io/apimachinery/pkg/api/errors" - metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" - k8sTypes "k8s.io/apimachinery/pkg/types" - - errors2 "github.com/flyteorg/flytestdlib/errors" -) - -const ( - ErrCheckPodStatus errors2.ErrorCode = "CHECK_POD_FAILED" -) - -func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, - config *Config, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference, currentState *arrayCore.State) ( - newState *arrayCore.State, logLinks []*idlCore.TaskLog, subTaskIDs []*string, err error) { - if int64(currentState.GetExecutionArraySize()) > config.MaxArrayJobSize { - ee := fmt.Errorf("array size > max allowed. Requested [%v]. Allowed [%v]", currentState.GetExecutionArraySize(), config.MaxArrayJobSize) - logger.Info(ctx, ee) - currentState = currentState.SetPhase(arrayCore.PhasePermanentFailure, 0).SetReason(ee.Error()) - return currentState, logLinks, subTaskIDs, nil - } - - logLinks = make([]*idlCore.TaskLog, 0, 4) - newState = currentState - msg := errorcollector.NewErrorMessageCollector() - newArrayStatus := &arraystatus.ArrayStatus{ - Summary: arraystatus.ArraySummary{}, - Detailed: arrayCore.NewPhasesCompactArray(uint(currentState.GetExecutionArraySize())), - } - subTaskIDs = make([]*string, 0, len(currentState.GetArrayStatus().Detailed.GetItems())) - - // If we have arrived at this state for the first time then currentState has not been - // initialized with number of sub tasks. - if len(currentState.GetArrayStatus().Detailed.GetItems()) == 0 { - currentState.ArrayStatus = *newArrayStatus - } - - // If the current State is newly minted then we must initialize RetryAttempts to track how many - // times each subtask is executed. 
- if len(currentState.RetryAttempts.GetItems()) == 0 { - count := uint(currentState.GetExecutionArraySize()) - maxValue := bitarray.Item(tCtx.TaskExecutionMetadata().GetMaxAttempts()) - - retryAttemptsArray, err := bitarray.NewCompactArray(count, maxValue) - if err != nil { - logger.Errorf(context.Background(), "Failed to create attempts compact array with [count: %v, maxValue: %v]", count, maxValue) - return currentState, logLinks, subTaskIDs, nil - } - - // Set subtask retryAttempts using the existing task context retry attempt. For new tasks - // this will initialize to 0, but running tasks will use the existing retry attempt. - retryAttempt := bitarray.Item(tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID().RetryAttempt) - for i := 0; i < currentState.GetExecutionArraySize(); i++ { - retryAttemptsArray.SetItem(i, retryAttempt) - } - - currentState.RetryAttempts = retryAttemptsArray - } - - logPlugin, err := logs.InitializeLogPlugins(&config.LogConfig.Config) - if err != nil { - logger.Errorf(ctx, "Error initializing LogPlugins: [%s]", err) - return currentState, logLinks, subTaskIDs, err - } - - // identify max parallelism - taskTemplate, err := tCtx.TaskReader().Read(ctx) - if err != nil { - return currentState, logLinks, subTaskIDs, err - } else if taskTemplate == nil { - return currentState, logLinks, subTaskIDs, errors.Errorf(errors.BadTaskSpecification, "Required value not set, taskTemplate is nil") - } - - arrayJob, err := arrayCore.ToArrayJob(taskTemplate.GetCustom(), taskTemplate.TaskTypeVersion) - if err != nil { - return currentState, logLinks, subTaskIDs, err - } - - currentParallelism := 0 - maxParallelism := int(arrayJob.Parallelism) - - for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() { - existingPhase := core.Phases[existingPhaseIdx] - originalIdx := arrayCore.CalculateOriginalIndex(childIdx, newState.GetIndexesToCache()) - - retryAttempt := currentState.RetryAttempts.GetItem(childIdx) - podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), childIdx, retryAttempt) - - if existingPhase.IsTerminal() { - // If we get here it means we have already "processed" this terminal phase since we will only persist - // the phase after all processing is done (e.g. check outputs/errors file, record events... etc.). - - // Since we know we have already "processed" this terminal phase we can safely deallocate resource - err = deallocateResource(ctx, tCtx, config, podName) - if err != nil { - logger.Errorf(ctx, "Error releasing allocation token [%s] in LaunchAndCheckSubTasks [%s]", podName, err) - return currentState, logLinks, subTaskIDs, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") - } - - // If a subtask is marked as a retryable failure we check if the number of retries - // exceeds the maximum attempts. If so, transition the task to a permanent failure - // so that is not attempted again. If it can be retried, increment the retry attempts - // value and transition the task to "Undefined" so that it is reevaluated. 
- if existingPhase == core.PhaseRetryableFailure { - if uint32(retryAttempt+1) < tCtx.TaskExecutionMetadata().GetMaxAttempts() { - newState.RetryAttempts.SetItem(childIdx, retryAttempt+1) - - newArrayStatus.Summary.Inc(core.PhaseUndefined) - newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseUndefined)) - continue - } else { - existingPhase = core.PhasePermanentFailure - } - } - - newArrayStatus.Summary.Inc(existingPhase) - newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(existingPhase)) - - phaseInfo, err := FetchPodStatusAndLogs(ctx, kubeClient, - k8sTypes.NamespacedName{ - Name: podName, - Namespace: GetNamespaceForExecution(tCtx, config.NamespaceTemplate), - }, - originalIdx, - tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID().RetryAttempt, - retryAttempt, - logPlugin) - - if err != nil { - return currentState, logLinks, subTaskIDs, err - } - - if phaseInfo.Info() != nil { - logLinks = append(logLinks, phaseInfo.Info().Logs...) - } - - continue - } - - task := &Task{ - State: newState, - NewArrayStatus: newArrayStatus, - Config: config, - ChildIdx: childIdx, - OriginalIndex: originalIdx, - MessageCollector: &msg, - SubTaskIDs: subTaskIDs, - } - - // The first time we enter this state we will launch every subtask. On subsequent rounds, the pod - // has already been created so we return a Success value and continue with the Monitor step. - var launchResult LaunchResult - launchResult, err = task.Launch(ctx, tCtx, kubeClient) - if err != nil { - logger.Errorf(ctx, "K8s array - Launch error %v", err) - return currentState, logLinks, subTaskIDs, err - } - - switch launchResult { - case LaunchSuccess: - // Continue with execution if successful - case LaunchError: - return currentState, logLinks, subTaskIDs, err - // If Resource manager is enabled and there are currently not enough resources we can skip this round - // for a subtask and wait until there are enough resources. - case LaunchWaiting: - continue - case LaunchReturnState: - return currentState, logLinks, subTaskIDs, nil - } - - monitorResult, taskLogs, err := task.Monitor(ctx, tCtx, kubeClient, dataStore, outputPrefix, baseOutputDataSandbox, logPlugin) - - if len(taskLogs) > 0 { - logLinks = append(logLinks, taskLogs...) - } - subTaskIDs = task.SubTaskIDs - - if monitorResult != MonitorSuccess { - if err != nil { - logger.Errorf(ctx, "K8s array - Monitor error %v", err) - } - return currentState, logLinks, subTaskIDs, err - } - - // validate map task parallelism - newSubtaskPhase := core.Phases[newArrayStatus.Detailed.GetItem(childIdx)] - if !newSubtaskPhase.IsTerminal() || newSubtaskPhase == core.PhaseRetryableFailure { - currentParallelism++ - } - - if maxParallelism != 0 && currentParallelism >= maxParallelism { - // If max parallelism has been achieved we need to fill the subtask phase summary with - // the remaining subtasks so the overall map task phase can be accurately identified. 
- for i := childIdx + 1; i < len(currentState.GetArrayStatus().Detailed.GetItems()); i++ { - childSubtaskPhase := core.Phases[newArrayStatus.Detailed.GetItem(i)] - newArrayStatus.Summary.Inc(childSubtaskPhase) - } - - break - } - } - - newState = newState.SetArrayStatus(*newArrayStatus) - - phase := arrayCore.SummaryToPhase(ctx, currentState.GetOriginalMinSuccesses()-currentState.GetOriginalArraySize()+int64(currentState.GetExecutionArraySize()), newArrayStatus.Summary) - if phase == arrayCore.PhaseWriteToDiscoveryThenFail { - errorMsg := msg.Summary(GetConfig().MaxErrorStringLength) - newState = newState.SetReason(errorMsg) - } - - if phase == arrayCore.PhaseCheckingSubTaskExecutions { - newPhaseVersion := uint32(0) - - // For now, the only changes to PhaseVersion and PreviousSummary occur for running array jobs. - for phase, count := range newState.GetArrayStatus().Summary { - newPhaseVersion += uint32(phase) * uint32(count) - } - - newState = newState.SetPhase(phase, newPhaseVersion).SetReason("Task is still running.") - } else { - newState = newState.SetPhase(phase, core.DefaultPhaseVersion) - } - - return newState, logLinks, subTaskIDs, nil -} - -func FetchPodStatusAndLogs(ctx context.Context, client core.KubeClient, name k8sTypes.NamespacedName, index int, retryAttempt uint32, subtaskRetryAttempt uint64, logPlugin tasklog.Plugin) ( - info core.PhaseInfo, err error) { - - pod := &v1.Pod{ - TypeMeta: metaV1.TypeMeta{ - Kind: PodKind, - APIVersion: v1.SchemeGroupVersion.String(), - }, - } - - err = client.GetClient().Get(ctx, name, pod) - now := time.Now() - - if err != nil { - if k8serrors.IsNotFound(err) { - // If the object disappeared at this point, it means it was manually removed or garbage collected. - // Mark it as a failure. - return core.PhaseInfoFailed(core.PhaseRetryableFailure, &idlCore.ExecutionError{ - Code: string(k8serrors.ReasonForError(err)), - Message: err.Error(), - Kind: idlCore.ExecutionError_SYSTEM, - }, &core.TaskInfo{ - OccurredAt: &now, - }), nil - } - - return info, err - } - - t := flytek8s.GetLastTransitionOccurredAt(pod).Time - taskInfo := core.TaskInfo{ - OccurredAt: &t, - } - - if pod.Status.Phase != v1.PodPending && pod.Status.Phase != v1.PodUnknown { - // We append the subtaskRetryAttempt to the log name only when it is > 0 to ensure backwards - // compatibility when dynamically transitioning running map tasks to use subtask retry attempts. - var logName string - if subtaskRetryAttempt == 0 { - logName = fmt.Sprintf(" #%d-%d", retryAttempt, index) - } else { - logName = fmt.Sprintf(" #%d-%d-%d", retryAttempt, index, subtaskRetryAttempt) - } - - if logPlugin != nil { - o, err := logPlugin.GetTaskLogs(tasklog.Input{ - PodName: pod.Name, - Namespace: pod.Namespace, - LogName: logName, - PodUnixStartTime: pod.CreationTimestamp.Unix(), - }) - - if err != nil { - return core.PhaseInfoUndefined, err - } - taskInfo.Logs = o.TaskLogs - } - } - - var phaseInfo core.PhaseInfo - var err2 error - - switch pod.Status.Phase { - case v1.PodSucceeded: - phaseInfo, err2 = flytek8s.DemystifySuccess(pod.Status, taskInfo) - case v1.PodFailed: - phaseInfo, err2 = flytek8s.DemystifyFailure(pod.Status, taskInfo) - case v1.PodPending: - phaseInfo, err2 = flytek8s.DemystifyPending(pod.Status) - case v1.PodUnknown: - phaseInfo = core.PhaseInfoUndefined - default: - primaryContainerName, ok := pod.GetAnnotations()[primaryContainerKey] - if ok { - // Special handling for determining the phase of an array job for a Pod task. 
- phaseInfo = flytek8s.DeterminePrimaryContainerPhase(primaryContainerName, pod.Status.ContainerStatuses, &taskInfo) - if phaseInfo.Phase() == core.PhaseRunning && len(taskInfo.Logs) > 0 { - return core.PhaseInfoRunning(core.DefaultPhaseVersion+1, phaseInfo.Info()), nil - } - return phaseInfo, nil - } - - if len(taskInfo.Logs) > 0 { - phaseInfo = core.PhaseInfoRunning(core.DefaultPhaseVersion+1, &taskInfo) - } else { - phaseInfo = core.PhaseInfoRunning(core.DefaultPhaseVersion, &taskInfo) - } - } - - if err2 == nil && phaseInfo.Info() != nil { - // Append sub-job status in Log Name for viz. - for _, log := range phaseInfo.Info().Logs { - log.Name += fmt.Sprintf(" (%s)", phaseInfo.Phase().String()) - } - } - - return phaseInfo, err2 - -} diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go new file mode 100644 index 000000000..34c27994e --- /dev/null +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -0,0 +1,180 @@ +package k8s + +import ( + "context" + "fmt" + "regexp" + "strconv" + + pluginsCore "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s/config" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils" + podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" + + errors2 "github.com/flyteorg/flytestdlib/errors" + "github.com/flyteorg/flytestdlib/logger" + + v1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8stypes "k8s.io/apimachinery/pkg/types" +) + +const ( + ErrBuildPodTemplate errors2.ErrorCode = "POD_TEMPLATE_FAILED" + ErrReplaceCmdTemplate errors2.ErrorCode = "CMD_TEMPLATE_FAILED" + FlyteK8sArrayIndexVarName string = "FLYTE_K8S_ARRAY_INDEX" + finalizer string = "flyte/array" + JobIndexVarName string = "BATCH_JOB_ARRAY_INDEX_VAR_NAME" +) + +var ( + arrayJobEnvVars = []v1.EnvVar{ + { + Name: JobIndexVarName, + Value: FlyteK8sArrayIndexVarName, + }, + } + namespaceRegex = regexp.MustCompile("(?i){{.namespace}}(?i)") +) + +func addMetadata(stCtx SubTaskExecutionContext, pod *v1.Pod) { + cfg := GetConfig() + k8sPluginCfg := config.GetK8sPluginConfig() + taskExecutionMetadata := stCtx.TaskExecutionMetadata() + + // Default to parent namespace + namespace := taskExecutionMetadata.GetNamespace() + if cfg.NamespaceTemplate != "" { + if namespaceRegex.MatchString(cfg.NamespaceTemplate) { + namespace = namespaceRegex.ReplaceAllString(cfg.NamespaceTemplate, namespace) + } else { + namespace = cfg.NamespaceTemplate + } + } + + pod.SetNamespace(namespace) + pod.SetAnnotations(utils.UnionMaps(k8sPluginCfg.DefaultAnnotations, pod.GetAnnotations(), utils.CopyMap(taskExecutionMetadata.GetAnnotations()))) + pod.SetLabels(utils.UnionMaps(pod.GetLabels(), utils.CopyMap(taskExecutionMetadata.GetLabels()), k8sPluginCfg.DefaultLabels)) + pod.SetName(taskExecutionMetadata.GetTaskExecutionID().GetGeneratedName()) + + if !cfg.RemoteClusterConfig.Enabled { + pod.OwnerReferences = []metav1.OwnerReference{taskExecutionMetadata.GetOwnerReference()} + } + + if k8sPluginCfg.InjectFinalizer { + f := append(pod.GetFinalizers(), finalizer) + pod.SetFinalizers(f) + } + + if len(cfg.DefaultScheduler) > 0 { + pod.Spec.SchedulerName = cfg.DefaultScheduler + } + + // TODO - should these be appends? or left as overrides? 
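+ // As written, the config values replace any node selector or tolerations already present on the pod spec rather than appending to them.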
+ if len(cfg.NodeSelector) != 0 { + pod.Spec.NodeSelector = cfg.NodeSelector + } + if len(cfg.Tolerations) != 0 { + pod.Spec.Tolerations = cfg.Tolerations + } +} + +func abortSubtask() error { + // TODO + return nil +} + +func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, kubeClient pluginsCore.KubeClient) error { + o, err := podPlugin.DefaultPodPlugin.BuildResource(ctx, stCtx) + pod := o.(*v1.Pod) + if err != nil { + return err + } + + addMetadata(stCtx, pod) + + // inject maptask specific container environment variables + if len(pod.Spec.Containers) == 0 { + return errors2.Wrapf(ErrReplaceCmdTemplate, err, "No containers found in podSpec.") + } + + containerIndex, err := getTaskContainerIndex(pod) + if err != nil { + return err + } + + pod.Spec.Containers[containerIndex].Env = append(pod.Spec.Containers[containerIndex].Env, v1.EnvVar{ + Name: FlyteK8sArrayIndexVarName, + // Use the OriginalIndex which represents the position of the subtask in the original user's map task before + // compacting indexes caused by catalog-cache-check. + Value: strconv.Itoa(stCtx.originalIndex), + }) + + pod.Spec.Containers[containerIndex].Env = append(pod.Spec.Containers[containerIndex].Env, arrayJobEnvVars...) + + return kubeClient.GetClient().Create(ctx, pod) +} + +func finalizeSubtask() error { + // TODO + return nil +} + +func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, kubeClient pluginsCore.KubeClient) (pluginsCore.PhaseInfo, error) { + o, err := podPlugin.DefaultPodPlugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) + if err != nil { + return pluginsCore.PhaseInfoUndefined, err + } + + pod := o.(*v1.Pod) + logger.Infof(ctx, "BEFORE: %v", pod) + addMetadata(stCtx, pod) + + // Attempt to get resource from informer cache, if not found, retrieve it from API server. + nsName := k8stypes.NamespacedName{Namespace: pod.GetNamespace(), Name: pod.GetName()} + if err := kubeClient.GetClient().Get(ctx, nsName, pod); err != nil { + if isK8sObjectNotExists(err) { + // This happens sometimes because a node gets removed and K8s deletes the pod. This will result in a + // Pod does not exist error. This should be retried using the retry policy + //logger.Warningf(ctx, "Failed to find the Resource with name: %v. Error: %v", nsName, err) // TODO - log + failureReason := fmt.Sprintf("resource not found, name [%s]. reason: %s", nsName.String(), err.Error()) + //return pluginsCore.DoTransition(pluginsCore.PhaseInfoSystemRetryableFailure("ResourceDeletedExternally", failureReason, nil)), nil + + // TODO - validate? + // return pluginsCore.PhaseInfoUndefined, err? + return pluginsCore.PhaseInfoSystemRetryableFailure("ResourceDeletedExternally", failureReason, nil), err + } + + //logger.Warningf(ctx, "Failed to retrieve Resource Details with name: %v. Error: %v", nsName, err) + // TODO - validate? + return pluginsCore.PhaseInfoUndefined, err + } + + logger.Infof(ctx, "AFTER: %v", pod) + return podPlugin.DefaultPodPlugin.GetTaskPhase(ctx, stCtx, pod) +} + +func getTaskContainerIndex(pod *v1.Pod) (int, error) { + primaryContainerName, ok := pod.Annotations[podPlugin.PrimaryContainerKey] + // For tasks with a Container target, we only ever build one container as part of the pod + if !ok { + if len(pod.Spec.Containers) == 1 { + return 0, nil + } + // For tasks with a K8sPod task target, they may produce multiple containers but at least one must be the designated primary. 
+ return -1, errors2.Errorf(ErrBuildPodTemplate, "Expected a specified primary container key when building an array job with a K8sPod spec target") + + } + + for idx, container := range pod.Spec.Containers { + if container.Name == primaryContainerName { + return idx, nil + } + } + return -1, errors2.Errorf(ErrBuildPodTemplate, "Couldn't find any container matching the primary container key when building an array job with a K8sPod spec target") +} + +func isK8sObjectNotExists(err error) bool { + return k8serrors.IsNotFound(err) || k8serrors.IsGone(err) || k8serrors.IsResourceExpired(err) +} diff --git a/go/tasks/plugins/array/k8s/subtask_exec_context.go b/go/tasks/plugins/array/k8s/subtask_exec_context.go new file mode 100644 index 000000000..85df5d00c --- /dev/null +++ b/go/tasks/plugins/array/k8s/subtask_exec_context.go @@ -0,0 +1,150 @@ +package k8s + +import ( + "context" + "fmt" + "strconv" + + "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" + + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/io" + pluginsCore "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils" + "github.com/flyteorg/flyteplugins/go/tasks/plugins/array" + podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" +) + +// TaskExecutionContext provides a layer on top of core TaskExecutionContext with a custom TaskExecutionMetadata. +type SubTaskExecutionContext struct { + pluginsCore.TaskExecutionContext + arrayInputReader io.InputReader + metadataOverride pluginsCore.TaskExecutionMetadata + originalIndex int + retryAttempt uint64 + subtaskReader SubTaskReader +} + +// InputReader overrides the TaskExecutionContext from base and returns a specialized context for Array +func (s SubTaskExecutionContext) InputReader() io.InputReader { + return s.arrayInputReader +} + +func (s SubTaskExecutionContext) TaskExecutionMetadata() pluginsCore.TaskExecutionMetadata { + return s.metadataOverride +} + +func (s SubTaskExecutionContext) TaskReader() pluginsCore.TaskReader { + return s.subtaskReader +} + +func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTemplate *core.TaskTemplate, originalIndex int, retryAttempt uint64) SubTaskExecutionContext { + arrayInputReader := array.GetInputReader(tCtx, taskTemplate) + taskExecutionMetadata := tCtx.TaskExecutionMetadata() + taskExecutionID := taskExecutionMetadata.GetTaskExecutionID() + metadataOverride := SubTaskExecutionMetadata{ + taskExecutionMetadata, + SubTaskExecutionID{ + taskExecutionID, + originalIndex, + taskExecutionID.GetGeneratedName(), + retryAttempt, + }, + } + + subtaskTemplate := &core.TaskTemplate{} + //var subtaskTemplate *core.TaskTemplate + *subtaskTemplate = *taskTemplate + + if subtaskTemplate != nil { + subtaskTemplate.TaskTypeVersion = 2 + if subtaskTemplate.GetContainer() != nil { + subtaskTemplate.Type = podPlugin.ContainerTaskType + } else if taskTemplate.GetK8SPod() != nil { + subtaskTemplate.Type = podPlugin.SidecarTaskType + } + } + + subtaskReader := SubTaskReader{tCtx.TaskReader(), subtaskTemplate} + + return SubTaskExecutionContext{ + TaskExecutionContext: tCtx, + arrayInputReader: arrayInputReader, + metadataOverride: metadataOverride, + originalIndex: originalIndex, + retryAttempt: retryAttempt, + subtaskReader: subtaskReader, + } +} + +type SubTaskReader struct { + pluginsCore.TaskReader + subtaskTemplate *core.TaskTemplate +} + +func (s SubTaskReader) Read(ctx context.Context) (*core.TaskTemplate, error) { + return 
s.subtaskTemplate, nil +} + +//s.stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName() +type SubTaskExecutionID struct { + pluginsCore.TaskExecutionID + originalIndex int + parentName string + retryAttempt uint64 +} + +func (s SubTaskExecutionID) GetGeneratedName() string { + indexStr := strconv.Itoa(s.originalIndex) + + // If the retryAttempt is 0 we do not include it in the pod name. The gives us backwards + // compatibility in the ability to dynamically transition running map tasks to use subtask retries. + if s.retryAttempt == 0 { + return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v", s.parentName, indexStr)) + } + + retryAttemptStr := strconv.FormatUint(s.retryAttempt, 10) + return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v-%v", s.parentName, indexStr, retryAttemptStr)) +} + +// TODO hamersaw - enable secrets +// TaskExecutionMetadata provides a layer on top of the core TaskExecutionMetadata with customized annotations and labels +// for k8s plugins. +type SubTaskExecutionMetadata struct { + pluginsCore.TaskExecutionMetadata + + subtaskExecutionID SubTaskExecutionID + //annotations map[string]string + //labels map[string]string +} + +/*func (t TaskExecutionMetadata) GetLabels() map[string]string { + return t.labels +} + +func (t TaskExecutionMetadata) GetAnnotations() map[string]string { + return t.annotations +} + +// newTaskExecutionMetadata creates a TaskExecutionMetadata with secrets serialized as annotations and a label added +// to trigger the flyte pod webhook +func newTaskExecutionMetadata(tCtx pluginsCore.TaskExecutionMetadata, taskTmpl *core.TaskTemplate) (TaskExecutionMetadata, error) { + var err error + secretsMap := make(map[string]string) + injectSecretsLabel := make(map[string]string) + if taskTmpl.SecurityContext != nil && len(taskTmpl.SecurityContext.Secrets) > 0 { + secretsMap, err = secrets.MarshalSecretsToMapStrings(taskTmpl.SecurityContext.Secrets) + if err != nil { + return TaskExecutionMetadata{}, err + } + + injectSecretsLabel = map[string]string{ + secrets.PodLabel: secrets.PodLabelValue, + } + } + + return TaskExecutionMetadata{ + TaskExecutionMetadata: tCtx, + annotations: utils.UnionMaps(tCtx.GetAnnotations(), secretsMap), + labels: utils.UnionMaps(tCtx.GetLabels(), injectSecretsLabel), + }, nil +}*/ diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go deleted file mode 100644 index f29baeae5..000000000 --- a/go/tasks/plugins/array/k8s/task.go +++ /dev/null @@ -1,331 +0,0 @@ -package k8s - -import ( - "context" - "strconv" - "strings" - - metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" - - idlCore "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog" - - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array" - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/arraystatus" - arrayCore "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/core" - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/errorcollector" - "github.com/flyteorg/flytestdlib/bitarray" - errors2 "github.com/flyteorg/flytestdlib/errors" - "github.com/flyteorg/flytestdlib/logger" - "github.com/flyteorg/flytestdlib/storage" - corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" - k8serrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - k8sTypes 
"k8s.io/apimachinery/pkg/types" -) - -type Task struct { - State *arrayCore.State - NewArrayStatus *arraystatus.ArrayStatus - Config *Config - ChildIdx int - OriginalIndex int - MessageCollector *errorcollector.ErrorMessageCollector - SubTaskIDs []*string -} - -type LaunchResult int8 -type MonitorResult int8 - -const ( - LaunchSuccess LaunchResult = iota - LaunchError - LaunchWaiting - LaunchReturnState -) - -const finalizer = "flyte/array" - -func addPodFinalizer(pod *corev1.Pod) *corev1.Pod { - pod.Finalizers = append(pod.Finalizers, finalizer) - return pod -} - -func removeString(list []string, target string) []string { - ret := make([]string, 0) - for _, s := range list { - if s != target { - ret = append(ret, s) - } - } - - return ret -} - -func clearFinalizer(pod *corev1.Pod) *corev1.Pod { - pod.Finalizers = removeString(pod.Finalizers, finalizer) - return pod -} - -const ( - MonitorSuccess MonitorResult = iota - MonitorError -) - -func getTaskContainerIndex(pod *v1.Pod) (int, error) { - primaryContainerName, ok := pod.Annotations[primaryContainerKey] - // For tasks with a Container target, we only ever build one container as part of the pod - if !ok { - if len(pod.Spec.Containers) == 1 { - return 0, nil - } - // For tasks with a K8sPod task target, they may produce multiple containers but at least one must be the designated primary. - return -1, errors2.Errorf(ErrBuildPodTemplate, "Expected a specified primary container key when building an array job with a K8sPod spec target") - - } - - for idx, container := range pod.Spec.Containers { - if container.Name == primaryContainerName { - return idx, nil - } - } - return -1, errors2.Errorf(ErrBuildPodTemplate, "Couldn't find any container matching the primary container key when building an array job with a K8sPod spec target") -} - -func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (LaunchResult, error) { - podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx, t.Config.NamespaceTemplate) - if err != nil { - return LaunchError, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") - } - // Remove owner references for remote cluster execution - if t.Config.RemoteClusterConfig.Enabled { - podTemplate.OwnerReferences = nil - } - - if len(podTemplate.Spec.Containers) == 0 { - return LaunchError, errors2.Wrapf(ErrReplaceCmdTemplate, err, "No containers found in podSpec.") - } - - containerIndex, err := getTaskContainerIndex(&podTemplate) - if err != nil { - return LaunchError, err - } - - podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), t.ChildIdx, t.State.RetryAttempts.GetItem(t.ChildIdx)) - allocationStatus, err := allocateResource(ctx, tCtx, t.Config, podName) - if err != nil { - return LaunchError, err - } - - if allocationStatus != core.AllocationStatusGranted { - t.NewArrayStatus.Detailed.SetItem(t.ChildIdx, bitarray.Item(core.PhaseWaitingForResources)) - t.NewArrayStatus.Summary.Inc(core.PhaseWaitingForResources) - return LaunchWaiting, nil - } - - pod := podTemplate.DeepCopy() - pod.Name = podName - pod.Spec.Containers[containerIndex].Env = append(pod.Spec.Containers[containerIndex].Env, corev1.EnvVar{ - Name: FlyteK8sArrayIndexVarName, - // Use the OriginalIndex which represents the position of the subtask in the original user's map task before - // compacting indexes caused by catalog-cache-check. 
- Value: strconv.Itoa(t.OriginalIndex), - }) - - pod.Spec.Containers[containerIndex].Env = append(pod.Spec.Containers[containerIndex].Env, arrayJobEnvVars...) - taskTemplate, err := tCtx.TaskReader().Read(ctx) - if err != nil { - return LaunchError, errors2.Wrapf(ErrGetTaskTypeVersion, err, "Unable to read task template") - } else if taskTemplate == nil { - return LaunchError, errors2.Wrapf(ErrGetTaskTypeVersion, err, "Missing task template") - } - - pod = ApplyPodPolicies(ctx, t.Config, pod) - pod = applyNodeSelectorLabels(ctx, t.Config, pod) - pod = applyPodTolerations(ctx, t.Config, pod) - pod = addPodFinalizer(pod) - - // Check for existing pods to prevent unnecessary Resource-Quota usage: https://github.com/kubernetes/kubernetes/issues/76787 - existingPod := &corev1.Pod{} - err = kubeClient.GetCache().Get(ctx, client.ObjectKey{ - Namespace: pod.GetNamespace(), - Name: pod.GetName(), - }, existingPod) - - if err != nil && k8serrors.IsNotFound(err) { - // Attempt creating non-existing pod. - err = kubeClient.GetClient().Create(ctx, pod) - if err != nil && !k8serrors.IsAlreadyExists(err) { - if k8serrors.IsForbidden(err) { - if strings.Contains(err.Error(), "exceeded quota") { - // TODO: Quota errors are retried forever, it would be good to have support for backoff strategy. - logger.Infof(ctx, "Failed to launch job, resource quota exceeded. Err: %v", err) - t.State = t.State.SetPhase(arrayCore.PhaseWaitingForResources, 0).SetReason("Not enough resources to launch job") - } else { - t.State = t.State.SetPhase(arrayCore.PhaseRetryableFailure, 0).SetReason("Failed to launch job.") - } - - t.State.SetReason(err.Error()) - return LaunchReturnState, nil - } - - return LaunchError, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job.") - } - } else if err != nil { - // Another error returned. 
- logger.Error(ctx, err) - return LaunchError, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job.") - } - - return LaunchSuccess, nil -} - -func (t *Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference, - logPlugin tasklog.Plugin) (MonitorResult, []*idlCore.TaskLog, error) { - retryAttempt := t.State.RetryAttempts.GetItem(t.ChildIdx) - podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), t.ChildIdx, retryAttempt) - t.SubTaskIDs = append(t.SubTaskIDs, &podName) - var loglinks []*idlCore.TaskLog - - // Use original-index for log-name/links - originalIdx := arrayCore.CalculateOriginalIndex(t.ChildIdx, t.State.GetIndexesToCache()) - phaseInfo, err := FetchPodStatusAndLogs(ctx, kubeClient, - k8sTypes.NamespacedName{ - Name: podName, - Namespace: GetNamespaceForExecution(tCtx, t.Config.NamespaceTemplate), - }, - originalIdx, - tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID().RetryAttempt, - retryAttempt, - logPlugin) - if err != nil { - return MonitorError, loglinks, errors2.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status.") - } - - if phaseInfo.Info() != nil { - loglinks = phaseInfo.Info().Logs - } - - if phaseInfo.Err() != nil { - t.MessageCollector.Collect(t.ChildIdx, phaseInfo.Err().String()) - } - - actualPhase := phaseInfo.Phase() - if phaseInfo.Phase().IsSuccess() { - actualPhase, err = array.CheckTaskOutput(ctx, dataStore, outputPrefix, baseOutputDataSandbox, t.ChildIdx, originalIdx) - if err != nil { - return MonitorError, loglinks, err - } - } - - t.NewArrayStatus.Detailed.SetItem(t.ChildIdx, bitarray.Item(actualPhase)) - t.NewArrayStatus.Summary.Inc(actualPhase) - - return MonitorSuccess, loglinks, nil -} - -func (t Task) Abort(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) error { - podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), t.ChildIdx, t.State.RetryAttempts.GetItem(t.ChildIdx)) - pod := &corev1.Pod{ - TypeMeta: metav1.TypeMeta{ - Kind: PodKind, - APIVersion: metav1.SchemeGroupVersion.String(), - }, - ObjectMeta: metav1.ObjectMeta{ - Name: podName, - Namespace: GetNamespaceForExecution(tCtx, t.Config.NamespaceTemplate), - }, - } - - err := kubeClient.GetClient().Delete(ctx, pod) - if err != nil { - if k8serrors.IsNotFound(err) { - - return nil - } - return err - } - - return nil - -} - -func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) error { - podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), t.ChildIdx, t.State.RetryAttempts.GetItem(t.ChildIdx)) - - pod := &v1.Pod{ - TypeMeta: metaV1.TypeMeta{ - Kind: PodKind, - APIVersion: v1.SchemeGroupVersion.String(), - }, - } - - err := kubeClient.GetClient().Get(ctx, k8sTypes.NamespacedName{ - Name: podName, - Namespace: GetNamespaceForExecution(tCtx, t.Config.NamespaceTemplate), - }, pod) - - if err != nil { - if !k8serrors.IsNotFound(err) { - logger.Errorf(ctx, "Error fetching pod [%s] in Finalize [%s]", podName, err) - return err - } - } else { - pod = clearFinalizer(pod) - err := kubeClient.GetClient().Update(ctx, pod) - if err != nil { - logger.Errorf(ctx, "Error updating pod finalizer [%s] in Finalize [%s]", podName, err) - return err - } - } - - // Deallocate Resource - err = deallocateResource(ctx, tCtx, t.Config, podName) - if err 
!= nil { - logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) - return err - } - - return nil - -} - -func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) (core.AllocationStatus, error) { - if !IsResourceConfigSet(config.ResourceConfig) { - return core.AllocationStatusGranted, nil - } - - resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) - resourceConstraintSpec := core.ResourceConstraintsSpec{ - ProjectScopeResourceConstraint: nil, - NamespaceScopeResourceConstraint: nil, - } - - allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstraintSpec) - if err != nil { - logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. error %s", - tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) - return core.AllocationUndefined, err - } - - logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) - return allocationStatus, nil -} - -func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) error { - if !IsResourceConfigSet(config.ResourceConfig) { - return nil - } - resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) - - err := tCtx.ResourceManager().ReleaseResource(ctx, resourceNamespace, podName) - if err != nil { - logger.Errorf(ctx, "Error releasing token [%s]. error %s", podName, err) - return err - } - - return nil -} diff --git a/go/tasks/plugins/array/k8s/task_test.go b/go/tasks/plugins/array/k8s/task_test.go deleted file mode 100644 index 874caedac..000000000 --- a/go/tasks/plugins/array/k8s/task_test.go +++ /dev/null @@ -1,136 +0,0 @@ -package k8s - -import ( - "context" - "testing" - - v1 "k8s.io/api/core/v1" - - "github.com/stretchr/testify/mock" - - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core/mocks" - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/core" - - "github.com/flyteorg/flytestdlib/bitarray" - - "github.com/stretchr/testify/assert" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestFinalize(t *testing.T) { - ctx := context.Background() - - tCtx := getMockTaskExecutionContext(ctx, 0) - kubeClient := mocks.KubeClient{} - kubeClient.OnGetClient().Return(mocks.NewFakeKubeClient()) - - resourceManager := mocks.ResourceManager{} - podTemplate, _, _ := FlyteArrayJobToK8sPodTemplate(ctx, tCtx, "") - pod := addPodFinalizer(&podTemplate) - pod.Name = formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), 1, 1) - assert.Equal(t, "notfound-1-1", pod.Name) - assert.NoError(t, kubeClient.GetClient().Create(ctx, pod)) - - resourceManager.OnReleaseResourceMatch(mock.Anything, mock.Anything, mock.Anything).Return(nil) - tCtx.OnResourceManager().Return(&resourceManager) - - config := Config{ - MaxArrayJobSize: 100, - ResourceConfig: ResourceConfig{ - PrimaryLabel: "p", - Limit: 10, - }, - } - - retryAttemptsArray, err := bitarray.NewCompactArray(2, 1) - assert.NoError(t, err) - - state := core.State{ - RetryAttempts: retryAttemptsArray, - } - - task := &Task{ - State: &state, - Config: &config, - ChildIdx: 1, - } - - err = task.Finalize(ctx, tCtx, &kubeClient) - assert.NoError(t, err) -} - -func TestGetTaskContainerIndex(t *testing.T) { - t.Run("test container target", func(t *testing.T) { - pod := &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "container", - }, - }, - }, - } 
- index, err := getTaskContainerIndex(pod) - assert.NoError(t, err) - assert.Equal(t, 0, index) - }) - t.Run("test missing primary container annotation", func(t *testing.T) { - pod := &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "container", - }, - { - Name: "container b", - }, - }, - }, - } - _, err := getTaskContainerIndex(pod) - assert.EqualError(t, err, "[POD_TEMPLATE_FAILED] Expected a specified primary container key when building an array job with a K8sPod spec target") - }) - t.Run("test get primary container index", func(t *testing.T) { - pod := &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "container a", - }, - { - Name: "container b", - }, - { - Name: "container c", - }, - }, - }, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - primaryContainerKey: "container c", - }, - }, - } - index, err := getTaskContainerIndex(pod) - assert.NoError(t, err) - assert.Equal(t, 2, index) - }) - t.Run("specified primary container doesn't exist", func(t *testing.T) { - pod := &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "container a", - }, - }, - }, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - primaryContainerKey: "container c", - }, - }, - } - _, err := getTaskContainerIndex(pod) - assert.EqualError(t, err, "[POD_TEMPLATE_FAILED] Couldn't find any container matching the primary container key when building an array job with a K8sPod spec target") - }) -} diff --git a/go/tasks/plugins/array/k8s/transformer.go b/go/tasks/plugins/array/k8s/transformer.go deleted file mode 100644 index 8b10c54d3..000000000 --- a/go/tasks/plugins/array/k8s/transformer.go +++ /dev/null @@ -1,182 +0,0 @@ -package k8s - -import ( - "context" - "regexp" - - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core/template" - - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s/config" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils" - - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/io" - - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array" - - idlCore "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" - idlPlugins "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins" - "github.com/flyteorg/flyteplugins/go/tasks/errors" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s" - core2 "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/core" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -const PodKind = "pod" -const primaryContainerKey = "primary_container_name" - -var namespaceRegex = regexp.MustCompile("(?i){{.namespace}}(?i)") - -type arrayTaskContext struct { - core.TaskExecutionContext - arrayInputReader io.InputReader -} - -// InputReader overrides the TaskExecutionContext from base and returns a specialized context for Array -func (a *arrayTaskContext) InputReader() io.InputReader { - return a.arrayInputReader -} - -func GetNamespaceForExecution(tCtx core.TaskExecutionContext, namespaceTemplate string) string { - - // Default to parent namespace - namespace := tCtx.TaskExecutionMetadata().GetNamespace() - if namespaceTemplate != "" { - if namespaceRegex.MatchString(namespaceTemplate) { - namespace = namespaceRegex.ReplaceAllString(namespaceTemplate, namespace) - } else { - namespace = namespaceTemplate - } - } - return namespace -} - -// Initializes a pod from an array job task template with a K8sPod set as the 
task target. -// TODO: This should be removed by end of 2021 (it duplicates the pod plugin logic) once we improve array job handling -// and move it to the node level. See https://github.com/flyteorg/flyte/issues/1131 -func buildPodMapTask(task *idlCore.TaskTemplate, metadata core.TaskExecutionMetadata) (v1.Pod, error) { - if task.GetK8SPod() == nil || task.GetK8SPod().PodSpec == nil { - return v1.Pod{}, errors.Errorf(errors.BadTaskSpecification, "Missing pod spec for task") - } - var podSpec = &v1.PodSpec{} - err := utils.UnmarshalStructToObj(task.GetK8SPod().PodSpec, &podSpec) - if err != nil { - return v1.Pod{}, errors.Errorf(errors.BadTaskSpecification, - "Unable to unmarshal task custom [%v], Err: [%v]", task.GetCustom(), err.Error()) - } - primaryContainerName, ok := task.GetConfig()[primaryContainerKey] - if !ok { - return v1.Pod{}, errors.Errorf(errors.BadTaskSpecification, - "invalid TaskSpecification, config missing [%s] key in [%v]", primaryContainerKey, task.GetConfig()) - } - - var pod = v1.Pod{ - Spec: *podSpec, - } - if task.GetK8SPod().Metadata != nil { - if task.GetK8SPod().Metadata.Annotations != nil { - pod.Annotations = task.GetK8SPod().Metadata.Annotations - } - if task.GetK8SPod().Metadata.Labels != nil { - pod.Labels = task.GetK8SPod().Metadata.Labels - } - } - if len(pod.Annotations) == 0 { - pod.Annotations = make(map[string]string) - } - pod.Annotations[primaryContainerKey] = primaryContainerName - - // Set the restart policy to *not* inherit from the default so that a completed pod doesn't get caught in a - // CrashLoopBackoff after the initial job completion. - pod.Spec.RestartPolicy = v1.RestartPolicyNever - flytek8s.GetServiceAccountNameFromTaskExecutionMetadata(metadata) - return pod, nil -} - -// FlyteArrayJobToK8sPodTemplate returns a pod template for the given task context. Note that Name is not set on the -// result object. It's up to the caller to set the Name before creating the object in K8s. 
-func FlyteArrayJobToK8sPodTemplate(ctx context.Context, tCtx core.TaskExecutionContext, namespaceTemplate string) ( - podTemplate v1.Pod, job *idlPlugins.ArrayJob, err error) { - - // Check that the taskTemplate is valid - taskTemplate, err := tCtx.TaskReader().Read(ctx) - if err != nil { - return v1.Pod{}, nil, err - } else if taskTemplate == nil { - return v1.Pod{}, nil, errors.Errorf(errors.BadTaskSpecification, "Required value not set, taskTemplate is nil") - } - - if taskTemplate.GetContainer() == nil && taskTemplate.GetK8SPod() == nil { - return v1.Pod{}, nil, errors.Errorf(errors.BadTaskSpecification, - "Required value not set, taskTemplate Container or K8sPod") - } - - arrTCtx := &arrayTaskContext{ - TaskExecutionContext: tCtx, - arrayInputReader: array.GetInputReader(tCtx, taskTemplate), - } - - var arrayJob *idlPlugins.ArrayJob - if taskTemplate.GetCustom() != nil { - arrayJob, err = core2.ToArrayJob(taskTemplate.GetCustom(), taskTemplate.TaskTypeVersion) - if err != nil { - return v1.Pod{}, nil, err - } - } - - annotations := utils.UnionMaps(config.GetK8sPluginConfig().DefaultAnnotations, tCtx.TaskExecutionMetadata().GetAnnotations()) - labels := utils.UnionMaps(config.GetK8sPluginConfig().DefaultLabels, tCtx.TaskExecutionMetadata().GetLabels()) - - var pod = v1.Pod{ - TypeMeta: metav1.TypeMeta{ - Kind: PodKind, - APIVersion: v1.SchemeGroupVersion.String(), - }, - ObjectMeta: metav1.ObjectMeta{ - // Note that name is missing here - Namespace: GetNamespaceForExecution(tCtx, namespaceTemplate), - Labels: labels, - Annotations: annotations, - OwnerReferences: []metav1.OwnerReference{tCtx.TaskExecutionMetadata().GetOwnerReference()}, - }, - } - - if taskTemplate.GetContainer() != nil { - podSpec, err := flytek8s.ToK8sPodSpecWithInterruptible(ctx, arrTCtx, true) - if err != nil { - return v1.Pod{}, nil, err - } - - pod.Spec = *podSpec - } else if taskTemplate.GetK8SPod() != nil { - k8sPod, err := buildPodMapTask(taskTemplate, tCtx.TaskExecutionMetadata()) - if err != nil { - return v1.Pod{}, nil, err - } - - pod.Labels = utils.UnionMaps(pod.Labels, k8sPod.Labels) - pod.Annotations = utils.UnionMaps(pod.Annotations, k8sPod.Annotations) - pod.Spec = k8sPod.Spec - - containerIndex, err := getTaskContainerIndex(&pod) - if err != nil { - return v1.Pod{}, nil, err - } - - templateParameters := template.Parameters{ - TaskExecMetadata: tCtx.TaskExecutionMetadata(), - Inputs: arrTCtx.arrayInputReader, - OutputPath: tCtx.OutputWriter(), - Task: tCtx.TaskReader(), - } - - err = flytek8s.AddFlyteCustomizationsToContainer( - ctx, templateParameters, flytek8s.ResourceCustomizationModeMergeExistingResources, &pod.Spec.Containers[containerIndex]) - if err != nil { - return v1.Pod{}, nil, err - } - } - - return pod, arrayJob, nil -} diff --git a/go/tasks/plugins/array/k8s/transformer_test.go b/go/tasks/plugins/array/k8s/transformer_test.go deleted file mode 100644 index 822e3c544..000000000 --- a/go/tasks/plugins/array/k8s/transformer_test.go +++ /dev/null @@ -1,257 +0,0 @@ -package k8s - -import ( - "context" - "encoding/json" - "fmt" - "testing" - - "k8s.io/apimachinery/pkg/api/resource" - - "github.com/flyteorg/flytestdlib/storage" - - "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" - idlPlugins "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core/mocks" - mocks2 "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/io/mocks" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils" - structpb 
"github.com/golang/protobuf/ptypes/struct" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/mock" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - v12 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -const testPrimaryContainerName = "primary container" - -var podSpec = v1.PodSpec{ - Containers: []v1.Container{ - { - Name: testPrimaryContainerName, - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse("1"), - v1.ResourceStorage: resource.MustParse("2"), - }, - }, - }, - { - Name: "secondary container", - }, - }, -} - -var arrayJob = idlPlugins.ArrayJob{ - Size: 100, -} - -func getK8sPodTask(t *testing.T, annotations map[string]string) *core.TaskTemplate { - marshalledPodspec, err := json.Marshal(podSpec) - if err != nil { - t.Fatal(err) - } - - structObj := &structpb.Struct{} - if err := json.Unmarshal(marshalledPodspec, structObj); err != nil { - t.Fatal(err) - } - - custom := &structpb.Struct{} - if err := utils.MarshalStruct(&arrayJob, custom); err != nil { - t.Fatal(err) - } - - return &core.TaskTemplate{ - TaskTypeVersion: 2, - Config: map[string]string{ - primaryContainerKey: testPrimaryContainerName, - }, - Target: &core.TaskTemplate_K8SPod{ - K8SPod: &core.K8SPod{ - PodSpec: structObj, - Metadata: &core.K8SObjectMetadata{ - Labels: map[string]string{ - "label": "foo", - }, - Annotations: annotations, - }, - }, - }, - Custom: custom, - } -} - -func TestBuildPodMapTask(t *testing.T) { - tMeta := &mocks.TaskExecutionMetadata{} - tMeta.OnGetSecurityContext().Return(core.SecurityContext{}) - tMeta.OnGetK8sServiceAccount().Return("sa") - pod, err := buildPodMapTask(getK8sPodTask(t, map[string]string{ - "anno": "bar", - }), tMeta) - assert.NoError(t, err) - var expected = podSpec.DeepCopy() - expected.RestartPolicy = v1.RestartPolicyNever - assert.EqualValues(t, *expected, pod.Spec) - assert.EqualValues(t, map[string]string{ - "label": "foo", - }, pod.Labels) - assert.EqualValues(t, map[string]string{ - "anno": "bar", - "primary_container_name": "primary container", - }, pod.Annotations) -} - -func TestBuildPodMapTask_Errors(t *testing.T) { - t.Run("invalid task template", func(t *testing.T) { - _, err := buildPodMapTask(&core.TaskTemplate{}, nil) - assert.EqualError(t, err, "[BadTaskSpecification] Missing pod spec for task") - }) - b, err := json.Marshal(podSpec) - if err != nil { - t.Fatal(err) - } - - structObj := &structpb.Struct{} - if err := json.Unmarshal(b, structObj); err != nil { - t.Fatal(err) - } - t.Run("missing primary container annotation", func(t *testing.T) { - _, err = buildPodMapTask(&core.TaskTemplate{ - Target: &core.TaskTemplate_K8SPod{ - K8SPod: &core.K8SPod{ - PodSpec: structObj, - }, - }, - }, nil) - assert.EqualError(t, err, "[BadTaskSpecification] invalid TaskSpecification, config missing [primary_container_name] key in [map[]]") - }) -} - -func TestBuildPodMapTask_AddAnnotations(t *testing.T) { - tMeta := &mocks.TaskExecutionMetadata{} - tMeta.OnGetSecurityContext().Return(core.SecurityContext{}) - tMeta.OnGetK8sServiceAccount().Return("sa") - podTask := getK8sPodTask(t, nil) - pod, err := buildPodMapTask(podTask, tMeta) - assert.NoError(t, err) - var expected = podSpec.DeepCopy() - expected.RestartPolicy = v1.RestartPolicyNever - assert.EqualValues(t, *expected, pod.Spec) - assert.EqualValues(t, map[string]string{ - "label": "foo", - }, pod.Labels) - assert.EqualValues(t, map[string]string{ - "primary_container_name": "primary container", - }, pod.Annotations) -} - 
-func TestFlyteArrayJobToK8sPodTemplate(t *testing.T) { - ctx := context.TODO() - tr := &mocks.TaskReader{} - tr.OnRead(ctx).Return(getK8sPodTask(t, map[string]string{ - "anno": "bar", - }), nil) - - ir := &mocks2.InputReader{} - ir.OnGetInputPrefixPath().Return("/prefix/") - ir.OnGetInputPath().Return("/prefix/inputs.pb") - ir.OnGetMatch(mock.Anything).Return(&core.LiteralMap{}, nil) - - tMeta := &mocks.TaskExecutionMetadata{} - tMeta.OnGetNamespace().Return("n") - tMeta.OnGetLabels().Return(map[string]string{ - "tCtx": "label", - }) - tMeta.OnGetAnnotations().Return(map[string]string{ - "tCtx": "anno", - }) - tMeta.OnGetOwnerReference().Return(v12.OwnerReference{}) - tMeta.OnGetSecurityContext().Return(core.SecurityContext{}) - tMeta.OnGetK8sServiceAccount().Return("sa") - mockResourceOverrides := mocks.TaskOverrides{} - mockResourceOverrides.OnGetResources().Return(&v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "ephemeral-storage": resource.MustParse("1024Mi"), - }, - Limits: v1.ResourceList{ - "ephemeral-storage": resource.MustParse("2048Mi"), - }, - }) - tMeta.OnGetOverrides().Return(&mockResourceOverrides) - tMeta.OnGetPlatformResources().Return(&v1.ResourceRequirements{}) - tID := &mocks.TaskExecutionID{} - tID.OnGetID().Return(core.TaskExecutionIdentifier{ - NodeExecutionId: &core.NodeExecutionIdentifier{ - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: "my_name", - Project: "my_project", - Domain: "my_domain", - }, - }, - RetryAttempt: 1, - }) - tMeta.OnGetTaskExecutionID().Return(tID) - - outputReader := &mocks2.OutputWriter{} - outputReader.On("GetOutputPath").Return(storage.DataReference("/data/outputs.pb")) - outputReader.On("GetOutputPrefixPath").Return(storage.DataReference("/data/")) - outputReader.On("GetRawOutputPrefix").Return(storage.DataReference("")) - - tCtx := &mocks.TaskExecutionContext{} - tCtx.OnTaskReader().Return(tr) - tCtx.OnInputReader().Return(ir) - tCtx.OnTaskExecutionMetadata().Return(tMeta) - tCtx.OnOutputWriter().Return(outputReader) - - pod, job, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx, "") - assert.NoError(t, err) - assert.EqualValues(t, metav1.ObjectMeta{ - Namespace: "n", - Labels: map[string]string{ - "tCtx": "label", - "label": "foo", - }, - Annotations: map[string]string{ - "tCtx": "anno", - "anno": "bar", - "primary_container_name": "primary container", - "cluster-autoscaler.kubernetes.io/safe-to-evict": "false", - }, - OwnerReferences: []metav1.OwnerReference{ - {}, - }, - }, pod.ObjectMeta) - assert.EqualValues(t, &arrayJob, job) - defaultMemoryFromConfig := resource.MustParse("1024Mi") - assert.EqualValues(t, v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse("1"), - v1.ResourceMemory: defaultMemoryFromConfig, - v1.ResourceEphemeralStorage: resource.MustParse("1024Mi"), - }, - Limits: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse("1"), - v1.ResourceMemory: defaultMemoryFromConfig, - v1.ResourceEphemeralStorage: resource.MustParse("2048Mi"), - }, - }, pod.Spec.Containers[0].Resources, fmt.Sprintf("%+v", pod.Spec.Containers[0].Resources)) - assert.EqualValues(t, []v1.EnvVar{ - { - Name: "FLYTE_INTERNAL_EXECUTION_ID", - Value: "my_name", - }, - { - Name: "FLYTE_INTERNAL_EXECUTION_PROJECT", - Value: "my_project", - }, - { - Name: "FLYTE_INTERNAL_EXECUTION_DOMAIN", - Value: "my_domain", - }, - { - Name: "FLYTE_ATTEMPT_NUMBER", - Value: "1", - }, - }, pod.Spec.Containers[0].Env) -} diff --git a/go/tasks/plugins/k8s/pod/container.go b/go/tasks/plugins/k8s/pod/container.go index 
197ff502f..0ec8240ec 100644 --- a/go/tasks/plugins/k8s/pod/container.go +++ b/go/tasks/plugins/k8s/pod/container.go @@ -12,7 +12,7 @@ import ( ) const ( - containerTaskType = "container" + ContainerTaskType = "container" ) type containerPodBuilder struct { diff --git a/go/tasks/plugins/k8s/pod/plugin.go b/go/tasks/plugins/k8s/pod/plugin.go index 312759563..7ace35989 100644 --- a/go/tasks/plugins/k8s/pod/plugin.go +++ b/go/tasks/plugins/k8s/pod/plugin.go @@ -19,13 +19,15 @@ import ( const ( podTaskType = "pod" - primaryContainerKey = "primary_container_name" + PrimaryContainerKey = "primary_container_name" ) var ( - defaultPodBuilder = containerPodBuilder{} - podBuilders = map[string]podBuilder{ - sidecarTaskType: sidecarPodBuilder{}, + DefaultPodPlugin = plugin{ + defaultPodBuilder: containerPodBuilder{}, + podBuilders: map[string]podBuilder{ + SidecarTaskType: sidecarPodBuilder{}, + }, } ) @@ -104,7 +106,7 @@ func (plugin) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, return pluginsCore.PhaseInfoUndefined, nil } - primaryContainerName, exists := r.GetAnnotations()[primaryContainerKey] + primaryContainerName, exists := r.GetAnnotations()[PrimaryContainerKey] if !exists { // if the primary container annotation dos not exist, then the task requires all containers // to succeed to declare success. therefore, if the pod is not in one of the above states we @@ -128,43 +130,38 @@ func (plugin) GetProperties() k8s.PluginProperties { } func init() { - podPlugin := plugin{ - defaultPodBuilder: defaultPodBuilder, - podBuilders: podBuilders, - } - - // Register containerTaskType and sidecarTaskType plugin entries. These separate task types + // Register ContainerTaskType and SidecarTaskType plugin entries. These separate task types // still exist within the system, only now both are evaluated using the same internal pod plugin // instance. This simplifies migration as users may keep the same configuration but are // seamlessly transitioned from separate container and sidecar plugins to a single pod plugin. 
pluginmachinery.PluginRegistry().RegisterK8sPlugin( k8s.PluginEntry{ - ID: containerTaskType, - RegisteredTaskTypes: []pluginsCore.TaskType{containerTaskType}, + ID: ContainerTaskType, + RegisteredTaskTypes: []pluginsCore.TaskType{ContainerTaskType}, ResourceToWatch: &v1.Pod{}, - Plugin: podPlugin, + Plugin: DefaultPodPlugin, IsDefault: true, - DefaultForTaskTypes: []pluginsCore.TaskType{containerTaskType}, + DefaultForTaskTypes: []pluginsCore.TaskType{ContainerTaskType}, }) pluginmachinery.PluginRegistry().RegisterK8sPlugin( k8s.PluginEntry{ - ID: sidecarTaskType, - RegisteredTaskTypes: []pluginsCore.TaskType{sidecarTaskType}, + ID: SidecarTaskType, + RegisteredTaskTypes: []pluginsCore.TaskType{SidecarTaskType}, ResourceToWatch: &v1.Pod{}, - Plugin: podPlugin, + Plugin: DefaultPodPlugin, IsDefault: false, - DefaultForTaskTypes: []pluginsCore.TaskType{sidecarTaskType}, + DefaultForTaskTypes: []pluginsCore.TaskType{SidecarTaskType}, }) // register podTaskType plugin entry pluginmachinery.PluginRegistry().RegisterK8sPlugin( k8s.PluginEntry{ ID: podTaskType, - RegisteredTaskTypes: []pluginsCore.TaskType{containerTaskType, sidecarTaskType}, + RegisteredTaskTypes: []pluginsCore.TaskType{ContainerTaskType, SidecarTaskType}, ResourceToWatch: &v1.Pod{}, - Plugin: podPlugin, + Plugin: DefaultPodPlugin, IsDefault: true, - DefaultForTaskTypes: []pluginsCore.TaskType{containerTaskType, sidecarTaskType}, + DefaultForTaskTypes: []pluginsCore.TaskType{ContainerTaskType, SidecarTaskType}, }) } diff --git a/go/tasks/plugins/k8s/pod/sidecar.go b/go/tasks/plugins/k8s/pod/sidecar.go index ba208a8a8..7a8495258 100644 --- a/go/tasks/plugins/k8s/pod/sidecar.go +++ b/go/tasks/plugins/k8s/pod/sidecar.go @@ -15,7 +15,7 @@ import ( ) const ( - sidecarTaskType = "sidecar" + SidecarTaskType = "sidecar" ) // Why, you might wonder do we recreate the generated go struct generated from the plugins.SidecarJob proto? 
Because @@ -80,13 +80,13 @@ func (sidecarPodBuilder) buildPodSpec(ctx context.Context, task *core.TaskTempla func getPrimaryContainerNameFromConfig(task *core.TaskTemplate) (string, error) { if len(task.GetConfig()) == 0 { return "", errors.Errorf(errors.BadTaskSpecification, - "invalid TaskSpecification, config needs to be non-empty and include missing [%s] key", primaryContainerKey) + "invalid TaskSpecification, config needs to be non-empty and include missing [%s] key", PrimaryContainerKey) } - primaryContainerName, ok := task.GetConfig()[primaryContainerKey] + primaryContainerName, ok := task.GetConfig()[PrimaryContainerKey] if !ok { return "", errors.Errorf(errors.BadTaskSpecification, - "invalid TaskSpecification, config missing [%s] key in [%v]", primaryContainerKey, task.GetConfig()) + "invalid TaskSpecification, config missing [%s] key in [%v]", PrimaryContainerKey, task.GetConfig()) } return primaryContainerName, nil @@ -144,7 +144,7 @@ func (sidecarPodBuilder) updatePodMetadata(ctx context.Context, pod *v1.Pod, tas return err } - pod.Annotations[primaryContainerKey] = primaryContainerName + pod.Annotations[PrimaryContainerKey] = primaryContainerName return nil } From 13176d6c981c52c7d8396d196ad91995da4d7080 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Mon, 14 Feb 2022 10:24:20 -0600 Subject: [PATCH 02/20] initializing array log plugins Signed-off-by: Daniel Rammer --- go/tasks/logs/logging_utils.go | 7 +------ .../core/mocks/fake_k8s_client.go | 18 ++++++++++++++++-- go/tasks/plugins/array/k8s/management.go | 9 ++++++++- go/tasks/plugins/array/k8s/subtask.go | 9 ++++----- go/tasks/plugins/k8s/pod/plugin.go | 14 ++++++++++++-- 5 files changed, 41 insertions(+), 16 deletions(-) diff --git a/go/tasks/logs/logging_utils.go b/go/tasks/logs/logging_utils.go index f6b3e06db..805c6273c 100755 --- a/go/tasks/logs/logging_utils.go +++ b/go/tasks/logs/logging_utils.go @@ -18,12 +18,7 @@ type logPlugin struct { } // Internal -func GetLogsForContainerInPod(ctx context.Context, pod *v1.Pod, index uint32, nameSuffix string) ([]*core.TaskLog, error) { - logPlugin, err := InitializeLogPlugins(GetLogConfig()) - if err != nil { - return nil, err - } - +func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, pod *v1.Pod, index uint32, nameSuffix string) ([]*core.TaskLog, error) { if logPlugin == nil { return nil, nil } diff --git a/go/tasks/pluginmachinery/core/mocks/fake_k8s_client.go b/go/tasks/pluginmachinery/core/mocks/fake_k8s_client.go index 36ada0c9d..3d8278967 100644 --- a/go/tasks/pluginmachinery/core/mocks/fake_k8s_client.go +++ b/go/tasks/pluginmachinery/core/mocks/fake_k8s_client.go @@ -6,9 +6,9 @@ import ( "reflect" "sync" - "k8s.io/apimachinery/pkg/api/meta" - + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -89,6 +89,20 @@ func (m *FakeKubeClient) Create(ctx context.Context, obj client.Object, opts ... 
m.syncObj.Lock() defer m.syncObj.Unlock() + // if obj is a *v1.Pod then append a ContainerStatus for each Container + pod, ok := obj.(*v1.Pod) + if ok { + for i, _ := range pod.Spec.Containers { + if len(pod.Status.ContainerStatuses) > i { + continue + } + + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, v1.ContainerStatus{ + ContainerID: "docker://container-name", + }) + } + } + accessor, err := meta.Accessor(obj) if err != nil { return err diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index b43d66a1f..6c8141cf8 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -8,6 +8,7 @@ import ( idlCore "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" "github.com/flyteorg/flyteplugins/go/tasks/errors" + "github.com/flyteorg/flyteplugins/go/tasks/logs" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" "github.com/flyteorg/flyteplugins/go/tasks/plugins/array" "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/arraystatus" @@ -101,6 +102,12 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon currentState.RetryAttempts = retryAttemptsArray } + // initialize log plugin + logPlugin, err := logs.InitializeLogPlugins(&config.LogConfig.Config) + if err != nil { + return currentState, logLinks, subTaskIDs, err + } + // identify max parallelism taskTemplate, err := tCtx.TaskReader().Read(ctx) if err != nil { @@ -168,7 +175,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon } // monitor pod - phaseInfo, err := getSubtaskPhaseInfo(ctx, stCtx, kubeClient) + phaseInfo, err := getSubtaskPhaseInfo(ctx, stCtx, kubeClient, logPlugin) if err != nil { return currentState, logLinks, subTaskIDs, err } diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index 34c27994e..94cf4da81 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -8,11 +8,12 @@ import ( pluginsCore "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s/config" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils" podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" errors2 "github.com/flyteorg/flytestdlib/errors" - "github.com/flyteorg/flytestdlib/logger" + //"github.com/flyteorg/flytestdlib/logger" // TODO hamersaw - remove v1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -121,14 +122,13 @@ func finalizeSubtask() error { return nil } -func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, kubeClient pluginsCore.KubeClient) (pluginsCore.PhaseInfo, error) { +func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, kubeClient pluginsCore.KubeClient, logPlugin tasklog.Plugin) (pluginsCore.PhaseInfo, error) { o, err := podPlugin.DefaultPodPlugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) if err != nil { return pluginsCore.PhaseInfoUndefined, err } pod := o.(*v1.Pod) - logger.Infof(ctx, "BEFORE: %v", pod) addMetadata(stCtx, pod) // Attempt to get resource from informer cache, if not found, retrieve it from API server. 
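A minimal sketch of the call pattern this commit introduces, using only names already present in this diff; the loop bookkeeping around the subtask context is elided, so this is an illustration rather than a definitive implementation:

    // Build the log plugin once per evaluation round from the array plugin config,
    // instead of re-reading the global log config inside GetLogsForContainerInPod
    // on every subtask check.
    logPlugin, err := logs.InitializeLogPlugins(&config.LogConfig.Config)
    if err != nil {
        return currentState, logLinks, subTaskIDs, err
    }

    // Every subtask phase check in the round then shares that one plugin instance.
    phaseInfo, err := getSubtaskPhaseInfo(ctx, stCtx, kubeClient, logPlugin)
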
@@ -151,8 +151,7 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, kub return pluginsCore.PhaseInfoUndefined, err } - logger.Infof(ctx, "AFTER: %v", pod) - return podPlugin.DefaultPodPlugin.GetTaskPhase(ctx, stCtx, pod) + return podPlugin.DefaultPodPlugin.GetTaskPhaseWithLogPlugin(ctx, stCtx, pod, logPlugin) } func getTaskContainerIndex(pod *v1.Pod) (int, error) { diff --git a/go/tasks/plugins/k8s/pod/plugin.go b/go/tasks/plugins/k8s/pod/plugin.go index 7ace35989..04cf8f702 100644 --- a/go/tasks/plugins/k8s/pod/plugin.go +++ b/go/tasks/plugins/k8s/pod/plugin.go @@ -11,6 +11,7 @@ import ( pluginsCore "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/k8s" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog" v1 "k8s.io/api/core/v1" @@ -77,7 +78,16 @@ func (p plugin) BuildResource(ctx context.Context, taskCtx pluginsCore.TaskExecu return pod, nil } -func (plugin) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, r client.Object) (pluginsCore.PhaseInfo, error) { +func (p plugin) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, r client.Object) (pluginsCore.PhaseInfo, error) { + logPlugin, err := logs.InitializeLogPlugins(logs.GetLogConfig()) + if err != nil { + return pluginsCore.PhaseInfoUndefined, err + } + + return p.GetTaskPhaseWithLogPlugin(ctx, pluginContext, r, logPlugin) +} + +func (plugin) GetTaskPhaseWithLogPlugin(ctx context.Context, pluginContext k8s.PluginContext, r client.Object, logPlugin tasklog.Plugin) (pluginsCore.PhaseInfo, error) { pod := r.(*v1.Pod) transitionOccurredAt := flytek8s.GetLastTransitionOccurredAt(pod).Time @@ -86,7 +96,7 @@ func (plugin) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, } if pod.Status.Phase != v1.PodPending && pod.Status.Phase != v1.PodUnknown { - taskLogs, err := logs.GetLogsForContainerInPod(ctx, pod, 0, " (User)") + taskLogs, err := logs.GetLogsForContainerInPod(ctx, logPlugin, pod, 0, " (User)") if err != nil { return pluginsCore.PhaseInfoUndefined, err } From aeed667908ac7901064e673340bb12cfbbd7c202 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Mon, 14 Feb 2022 12:39:31 -0600 Subject: [PATCH 03/20] fixed namespace template tests Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/management.go | 4 ++-- go/tasks/plugins/array/k8s/subtask.go | 17 ++++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index 6c8141cf8..fd21b1dfc 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -155,7 +155,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon } // create subtask - launchSubtask(ctx, stCtx, kubeClient) + launchSubtask(ctx, stCtx, config, kubeClient) if err != nil && !k8serrors.IsAlreadyExists(err) { if k8serrors.IsForbidden(err) { if strings.Contains(err.Error(), "exceeded quota") { @@ -175,7 +175,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon } // monitor pod - phaseInfo, err := getSubtaskPhaseInfo(ctx, stCtx, kubeClient, logPlugin) + phaseInfo, err := getSubtaskPhaseInfo(ctx, stCtx, config, kubeClient, logPlugin) if err != nil { return currentState, logLinks, subTaskIDs, err } diff --git a/go/tasks/plugins/array/k8s/subtask.go 
b/go/tasks/plugins/array/k8s/subtask.go index 94cf4da81..a29529edc 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -13,7 +13,7 @@ import ( podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" errors2 "github.com/flyteorg/flytestdlib/errors" - //"github.com/flyteorg/flytestdlib/logger" // TODO hamersaw - remove + "github.com/flyteorg/flytestdlib/logger" // TODO hamersaw - remove v1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -39,8 +39,7 @@ var ( namespaceRegex = regexp.MustCompile("(?i){{.namespace}}(?i)") ) -func addMetadata(stCtx SubTaskExecutionContext, pod *v1.Pod) { - cfg := GetConfig() +func addMetadata(stCtx SubTaskExecutionContext, cfg *Config, pod *v1.Pod) { k8sPluginCfg := config.GetK8sPluginConfig() taskExecutionMetadata := stCtx.TaskExecutionMetadata() @@ -59,6 +58,8 @@ func addMetadata(stCtx SubTaskExecutionContext, pod *v1.Pod) { pod.SetLabels(utils.UnionMaps(pod.GetLabels(), utils.CopyMap(taskExecutionMetadata.GetLabels()), k8sPluginCfg.DefaultLabels)) pod.SetName(taskExecutionMetadata.GetTaskExecutionID().GetGeneratedName()) + logger.Infof(context.Background(),"NAME: %s", pod.GetName()) + if !cfg.RemoteClusterConfig.Enabled { pod.OwnerReferences = []metav1.OwnerReference{taskExecutionMetadata.GetOwnerReference()} } @@ -86,14 +87,14 @@ func abortSubtask() error { return nil } -func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, kubeClient pluginsCore.KubeClient) error { +func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, config *Config, kubeClient pluginsCore.KubeClient) error { o, err := podPlugin.DefaultPodPlugin.BuildResource(ctx, stCtx) pod := o.(*v1.Pod) if err != nil { return err } - addMetadata(stCtx, pod) + addMetadata(stCtx, config, pod) // inject maptask specific container environment variables if len(pod.Spec.Containers) == 0 { @@ -122,14 +123,16 @@ func finalizeSubtask() error { return nil } -func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, kubeClient pluginsCore.KubeClient, logPlugin tasklog.Plugin) (pluginsCore.PhaseInfo, error) { +func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, config *Config, kubeClient pluginsCore.KubeClient, logPlugin tasklog.Plugin) (pluginsCore.PhaseInfo, error) { o, err := podPlugin.DefaultPodPlugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) if err != nil { return pluginsCore.PhaseInfoUndefined, err } pod := o.(*v1.Pod) - addMetadata(stCtx, pod) + addMetadata(stCtx, config, pod) + + logger.Infof(ctx, "POD: %v", pod) // Attempt to get resource from informer cache, if not found, retrieve it from API server. 
nsName := k8stypes.NamespacedName{Namespace: pod.GetNamespace(), Name: pod.GetName()} From aafa66edbc01c9e578b2bc6b22a7e7c28b32a5ce Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Mon, 14 Feb 2022 12:53:24 -0600 Subject: [PATCH 04/20] fixed podname for logs Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/management.go | 1 + go/tasks/plugins/array/k8s/subtask.go | 5 ----- go/tasks/plugins/array/k8s/subtask_exec_context.go | 6 +++++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index fd21b1dfc..e07cc0e4b 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -138,6 +138,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon originalIdx := arrayCore.CalculateOriginalIndex(childIdx, newState.GetIndexesToCache()) stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, originalIdx, retryAttempt) podName := stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName() + logger.Infof(ctx, "PODNAME: %v", podName) if existingPhase == core.PhaseUndefined || existingPhase == core.PhaseWaitingForResources || existingPhase == core.PhaseRetryableFailure { // attempt to allocateResource diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index a29529edc..a399fdad8 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -13,7 +13,6 @@ import ( podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" errors2 "github.com/flyteorg/flytestdlib/errors" - "github.com/flyteorg/flytestdlib/logger" // TODO hamersaw - remove v1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -58,8 +57,6 @@ func addMetadata(stCtx SubTaskExecutionContext, cfg *Config, pod *v1.Pod) { pod.SetLabels(utils.UnionMaps(pod.GetLabels(), utils.CopyMap(taskExecutionMetadata.GetLabels()), k8sPluginCfg.DefaultLabels)) pod.SetName(taskExecutionMetadata.GetTaskExecutionID().GetGeneratedName()) - logger.Infof(context.Background(),"NAME: %s", pod.GetName()) - if !cfg.RemoteClusterConfig.Enabled { pod.OwnerReferences = []metav1.OwnerReference{taskExecutionMetadata.GetOwnerReference()} } @@ -132,8 +129,6 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con pod := o.(*v1.Pod) addMetadata(stCtx, config, pod) - logger.Infof(ctx, "POD: %v", pod) - // Attempt to get resource from informer cache, if not found, retrieve it from API server. 
nsName := k8stypes.NamespacedName{Namespace: pod.GetNamespace(), Name: pod.GetName()} if err := kubeClient.GetClient().Get(ctx, nsName, pod); err != nil { diff --git a/go/tasks/plugins/array/k8s/subtask_exec_context.go b/go/tasks/plugins/array/k8s/subtask_exec_context.go index 85df5d00c..7318851c1 100644 --- a/go/tasks/plugins/array/k8s/subtask_exec_context.go +++ b/go/tasks/plugins/array/k8s/subtask_exec_context.go @@ -39,6 +39,7 @@ func (s SubTaskExecutionContext) TaskReader() pluginsCore.TaskReader { func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTemplate *core.TaskTemplate, originalIndex int, retryAttempt uint64) SubTaskExecutionContext { arrayInputReader := array.GetInputReader(tCtx, taskTemplate) + //metadataOverride := tCtx.TaskExecutionMetadata() taskExecutionMetadata := tCtx.TaskExecutionMetadata() taskExecutionID := taskExecutionMetadata.GetTaskExecutionID() metadataOverride := SubTaskExecutionMetadata{ @@ -85,7 +86,6 @@ func (s SubTaskReader) Read(ctx context.Context) (*core.TaskTemplate, error) { return s.subtaskTemplate, nil } -//s.stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName() type SubTaskExecutionID struct { pluginsCore.TaskExecutionID originalIndex int @@ -117,6 +117,10 @@ type SubTaskExecutionMetadata struct { //labels map[string]string } +func (s SubTaskExecutionMetadata) GetTaskExecutionID() pluginsCore.TaskExecutionID { + return s.subtaskExecutionID +} + /*func (t TaskExecutionMetadata) GetLabels() map[string]string { return t.labels } From b2fe8f5bb7be856fe2922e84bb49c29e23f1737d Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Mon, 14 Feb 2022 13:17:20 -0600 Subject: [PATCH 05/20] setting task log names based on retry attempt and index Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/subtask.go | 10 +++++++++- go/tasks/plugins/array/k8s/subtask_exec_context.go | 9 +++++++++ go/tasks/plugins/k8s/pod/plugin.go | 6 +++--- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index a399fdad8..fe4129bce 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -149,7 +149,15 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con return pluginsCore.PhaseInfoUndefined, err } - return podPlugin.DefaultPodPlugin.GetTaskPhaseWithLogPlugin(ctx, stCtx, pod, logPlugin) + /*var logName string + if subtaskRetryAttempt == 0 { + logName = fmt.Sprintf(" #%d-%d", retryAttempt, index) + } else { + logName = fmt.Sprintf(" #%d-%d-%d", retryAttempt, index, subtaskRetryAttempt) + }*/ + + stID, _ := stCtx.TaskExecutionMetadata().GetTaskExecutionID().(SubTaskExecutionID) + return podPlugin.DefaultPodPlugin.GetTaskPhaseWithLogs(ctx, stCtx, pod, logPlugin, stID.GetLogSuffix()) } func getTaskContainerIndex(pod *v1.Pod) (int, error) { diff --git a/go/tasks/plugins/array/k8s/subtask_exec_context.go b/go/tasks/plugins/array/k8s/subtask_exec_context.go index 7318851c1..2ca674f19 100644 --- a/go/tasks/plugins/array/k8s/subtask_exec_context.go +++ b/go/tasks/plugins/array/k8s/subtask_exec_context.go @@ -106,6 +106,15 @@ func (s SubTaskExecutionID) GetGeneratedName() string { return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v-%v", s.parentName, indexStr, retryAttemptStr)) } +func (s SubTaskExecutionID) GetLogSuffix() string { + // If retryAttempt is 0 we do not include it in the log suffix to match the pod name. 
+ if s.retryAttempt == 0{ + return fmt.Sprintf(" #%d", s.originalIndex) + } + + return fmt.Sprintf(" #%d-%d", s.originalIndex, s.retryAttempt) +} + // TODO hamersaw - enable secrets // TaskExecutionMetadata provides a layer on top of the core TaskExecutionMetadata with customized annotations and labels // for k8s plugins. diff --git a/go/tasks/plugins/k8s/pod/plugin.go b/go/tasks/plugins/k8s/pod/plugin.go index 04cf8f702..5c1f22e66 100644 --- a/go/tasks/plugins/k8s/pod/plugin.go +++ b/go/tasks/plugins/k8s/pod/plugin.go @@ -84,10 +84,10 @@ func (p plugin) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContex return pluginsCore.PhaseInfoUndefined, err } - return p.GetTaskPhaseWithLogPlugin(ctx, pluginContext, r, logPlugin) + return p.GetTaskPhaseWithLogs(ctx, pluginContext, r, logPlugin, " (User)") } -func (plugin) GetTaskPhaseWithLogPlugin(ctx context.Context, pluginContext k8s.PluginContext, r client.Object, logPlugin tasklog.Plugin) (pluginsCore.PhaseInfo, error) { +func (plugin) GetTaskPhaseWithLogs(ctx context.Context, pluginContext k8s.PluginContext, r client.Object, logPlugin tasklog.Plugin, logSuffix string) (pluginsCore.PhaseInfo, error) { pod := r.(*v1.Pod) transitionOccurredAt := flytek8s.GetLastTransitionOccurredAt(pod).Time @@ -96,7 +96,7 @@ func (plugin) GetTaskPhaseWithLogPlugin(ctx context.Context, pluginContext k8s.P } if pod.Status.Phase != v1.PodPending && pod.Status.Phase != v1.PodUnknown { - taskLogs, err := logs.GetLogsForContainerInPod(ctx, logPlugin, pod, 0, " (User)") + taskLogs, err := logs.GetLogsForContainerInPod(ctx, logPlugin, pod, 0, logSuffix) if err != nil { return pluginsCore.PhaseInfoUndefined, err } From 60c10b2e4d7ae1f2fc6aaa8da0a58cdc9212ed59 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Mon, 14 Feb 2022 13:28:02 -0600 Subject: [PATCH 06/20] setting task log names correctly Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/subtask.go | 22 ++++++++++++------- .../plugins/array/k8s/subtask_exec_context.go | 16 +++++++++----- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index fe4129bce..59cae093b 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -149,15 +149,21 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con return pluginsCore.PhaseInfoUndefined, err } - /*var logName string - if subtaskRetryAttempt == 0 { - logName = fmt.Sprintf(" #%d-%d", retryAttempt, index) - } else { - logName = fmt.Sprintf(" #%d-%d-%d", retryAttempt, index, subtaskRetryAttempt) - }*/ - stID, _ := stCtx.TaskExecutionMetadata().GetTaskExecutionID().(SubTaskExecutionID) - return podPlugin.DefaultPodPlugin.GetTaskPhaseWithLogs(ctx, stCtx, pod, logPlugin, stID.GetLogSuffix()) + phaseInfo, err := podPlugin.DefaultPodPlugin.GetTaskPhaseWithLogs(ctx, stCtx, pod, logPlugin, stID.GetLogSuffix()) + if err != nil { + return pluginsCore.PhaseInfoUndefined, err + } + + if phaseInfo.Info() != nil { + // Append sub-job status in Log Name for viz. 
+ for _, log := range phaseInfo.Info().Logs { + log.Name += fmt.Sprintf(" (%s)", phaseInfo.Phase().String()) + } + } + + + return phaseInfo, err } func getTaskContainerIndex(pod *v1.Pod) (int, error) { diff --git a/go/tasks/plugins/array/k8s/subtask_exec_context.go b/go/tasks/plugins/array/k8s/subtask_exec_context.go index 2ca674f19..dbc0ae97e 100644 --- a/go/tasks/plugins/array/k8s/subtask_exec_context.go +++ b/go/tasks/plugins/array/k8s/subtask_exec_context.go @@ -107,12 +107,16 @@ func (s SubTaskExecutionID) GetGeneratedName() string { } func (s SubTaskExecutionID) GetLogSuffix() string { - // If retryAttempt is 0 we do not include it in the log suffix to match the pod name. - if s.retryAttempt == 0{ - return fmt.Sprintf(" #%d", s.originalIndex) - } - - return fmt.Sprintf(" #%d-%d", s.originalIndex, s.retryAttempt) + return fmt.Sprintf(" #%d-%d", s.retryAttempt, s.originalIndex) + + // TODO - I don't think this is correct - + // should be originalIndex-retryAttempt [however](https://github.com/flyteorg/flyteplugins/pull/186#discussion_r666569825) + // but [this](https://github.com/flyteorg/flyteplugins/blob/b671abcba2f67cff5610bb9050ee75762dba3d03/go/tasks/plugins/array/k8s/task.go#L183) + /* + synopsis - the GetGeneratedName uses the taskExecutionContext retryAttempt to compute a name (ex. pod-0) + before tracking subtask retry attempts the pod name was this retry attempt with the index (ex. pod-0-0 or pod-1-0) for retry attempt 0 and 1 of index 0 + now we track subtask retry attempts individually so the pod name could be (pod-0-1-0) we're leaving this for now - but might want to change in the fugure + */ } // TODO hamersaw - enable secrets From c1b185a0afdffbaaf01e020ceb9a4ce49c2000d1 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Tue, 15 Feb 2022 16:03:24 -0600 Subject: [PATCH 07/20] filled out management unit tests Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/management.go | 6 +- go/tasks/plugins/array/k8s/management_test.go | 566 ++++++++++-------- go/tasks/plugins/array/k8s/subtask.go | 9 +- .../plugins/array/k8s/subtask_exec_context.go | 12 +- 4 files changed, 329 insertions(+), 264 deletions(-) diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index e07cc0e4b..070edf78f 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -132,13 +132,13 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon retryAttempt++ newState.RetryAttempts.SetItem(childIdx, retryAttempt) } else if existingPhase.IsTerminal() { + newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(existingPhase)) continue } originalIdx := arrayCore.CalculateOriginalIndex(childIdx, newState.GetIndexesToCache()) - stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, originalIdx, retryAttempt) + stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, childIdx, originalIdx, retryAttempt) podName := stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName() - logger.Infof(ctx, "PODNAME: %v", podName) if existingPhase == core.PhaseUndefined || existingPhase == core.PhaseWaitingForResources || existingPhase == core.PhaseRetryableFailure { // attempt to allocateResource @@ -213,6 +213,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) return currentState, logLinks, subTaskIDs, err } + + // TODO - finalize and delete resource? 
} // validate parallelism diff --git a/go/tasks/plugins/array/k8s/management_test.go b/go/tasks/plugins/array/k8s/management_test.go index 3311be8d3..d76f6c60d 100644 --- a/go/tasks/plugins/array/k8s/management_test.go +++ b/go/tasks/plugins/array/k8s/management_test.go @@ -4,27 +4,29 @@ import ( "fmt" "testing" - "github.com/flyteorg/flyteplugins/go/tasks/logs" - - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/workqueue" - core2 "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" + + "github.com/flyteorg/flyteplugins/go/tasks/logs" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core/mocks" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s" mocks2 "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/io/mocks" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/workqueue" "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/arraystatus" - "github.com/flyteorg/flytestdlib/bitarray" - "github.com/stretchr/testify/mock" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - v12 "k8s.io/apimachinery/pkg/apis/meta/v1" - arrayCore "github.com/flyteorg/flyteplugins/go/tasks/plugins/array/core" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core/mocks" + "github.com/flyteorg/flytestdlib/bitarray" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "golang.org/x/net/context" structpb "google.golang.org/protobuf/types/known/structpb" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func createSampleContainerTask() *core2.Container { @@ -87,7 +89,7 @@ func getMockTaskExecutionContext(ctx context.Context, parallelism int) *mocks.Ta tMeta.OnGetNamespace().Return("n") tMeta.OnGetLabels().Return(nil) tMeta.OnGetAnnotations().Return(nil) - tMeta.OnGetOwnerReference().Return(v12.OwnerReference{}) + tMeta.OnGetOwnerReference().Return(metav1.OwnerReference{}) tMeta.OnGetPlatformResources().Return(&v1.ResourceRequirements{}) ow := &mocks2.OutputWriter{} @@ -109,330 +111,392 @@ func getMockTaskExecutionContext(ctx context.Context, parallelism int) *mocks.Ta return tCtx } -/*func TestGetNamespaceForExecution(t *testing.T) { +func TestCheckSubTasksState(t *testing.T) { ctx := context.Background() - tCtx := getMockTaskExecutionContext(ctx, 0) - - assert.Equal(t, GetNamespaceForExecution(tCtx, ""), tCtx.TaskExecutionMetadata().GetNamespace()) - assert.Equal(t, GetNamespaceForExecution(tCtx, "abcd"), "abcd") - assert.Equal(t, GetNamespaceForExecution(tCtx, "a-{{.namespace}}-b"), fmt.Sprintf("a-%s-b", tCtx.TaskExecutionMetadata().GetNamespace())) -}*/ - -func testSubTaskIDs(t *testing.T, actual []*string) { - var expected = make([]*string, 5) - for i := 0; i < len(expected); i++ { - subTaskID := fmt.Sprintf("notfound-%d", i) - expected[i] = &subTaskID + subtaskCount := 5 + + config := Config{ + MaxArrayJobSize: int64(subtaskCount * 10), + ResourceConfig: ResourceConfig{ + PrimaryLabel: "p", + Limit: subtaskCount, + }, } - assert.EqualValues(t, expected, actual) -} -func TestCheckSubTasksState(t *testing.T) { - ctx := context.Background() + fakeKubeClient := mocks.NewFakeKubeClient() + fakeKubeCache := mocks.NewFakeKubeCache() - tCtx := getMockTaskExecutionContext(ctx, 0) - kubeClient := mocks.KubeClient{} - kubeClient.OnGetClient().Return(mocks.NewFakeKubeClient()) - kubeClient.OnGetCache().Return(mocks.NewFakeKubeCache()) + for i:=0; i 0 { pod.Spec.SchedulerName 
= cfg.DefaultScheduler } - + // TODO - should these be appends? or left as overrides? if len(cfg.NodeSelector) != 0 { pod.Spec.NodeSelector = cfg.NodeSelector @@ -130,7 +132,7 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con addMetadata(stCtx, config, pod) // Attempt to get resource from informer cache, if not found, retrieve it from API server. - nsName := k8stypes.NamespacedName{Namespace: pod.GetNamespace(), Name: pod.GetName()} + nsName := k8stypes.NamespacedName{Name: pod.GetName(), Namespace: pod.GetNamespace()} if err := kubeClient.GetClient().Get(ctx, nsName, pod); err != nil { if isK8sObjectNotExists(err) { // This happens sometimes because a node gets removed and K8s deletes the pod. This will result in a @@ -141,7 +143,7 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con // TODO - validate? // return pluginsCore.PhaseInfoUndefined, err? - return pluginsCore.PhaseInfoSystemRetryableFailure("ResourceDeletedExternally", failureReason, nil), err + return pluginsCore.PhaseInfoSystemRetryableFailure("ResourceDeletedExternally", failureReason, nil), nil } //logger.Warningf(ctx, "Failed to retrieve Resource Details with name: %v. Error: %v", nsName, err) @@ -162,7 +164,6 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con } } - return phaseInfo, err } diff --git a/go/tasks/plugins/array/k8s/subtask_exec_context.go b/go/tasks/plugins/array/k8s/subtask_exec_context.go index dbc0ae97e..ed1aeebae 100644 --- a/go/tasks/plugins/array/k8s/subtask_exec_context.go +++ b/go/tasks/plugins/array/k8s/subtask_exec_context.go @@ -20,7 +20,6 @@ type SubTaskExecutionContext struct { arrayInputReader io.InputReader metadataOverride pluginsCore.TaskExecutionMetadata originalIndex int - retryAttempt uint64 subtaskReader SubTaskReader } @@ -37,7 +36,7 @@ func (s SubTaskExecutionContext) TaskReader() pluginsCore.TaskReader { return s.subtaskReader } -func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTemplate *core.TaskTemplate, originalIndex int, retryAttempt uint64) SubTaskExecutionContext { +func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTemplate *core.TaskTemplate, index, originalIndex int, retryAttempt uint64) SubTaskExecutionContext { arrayInputReader := array.GetInputReader(tCtx, taskTemplate) //metadataOverride := tCtx.TaskExecutionMetadata() taskExecutionMetadata := tCtx.TaskExecutionMetadata() @@ -46,7 +45,7 @@ func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTempl taskExecutionMetadata, SubTaskExecutionID{ taskExecutionID, - originalIndex, + index, taskExecutionID.GetGeneratedName(), retryAttempt, }, @@ -72,7 +71,6 @@ func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTempl arrayInputReader: arrayInputReader, metadataOverride: metadataOverride, originalIndex: originalIndex, - retryAttempt: retryAttempt, subtaskReader: subtaskReader, } } @@ -88,13 +86,13 @@ func (s SubTaskReader) Read(ctx context.Context) (*core.TaskTemplate, error) { type SubTaskExecutionID struct { pluginsCore.TaskExecutionID - originalIndex int + index int parentName string retryAttempt uint64 } func (s SubTaskExecutionID) GetGeneratedName() string { - indexStr := strconv.Itoa(s.originalIndex) + indexStr := strconv.Itoa(s.index) // If the retryAttempt is 0 we do not include it in the pod name. The gives us backwards // compatibility in the ability to dynamically transition running map tasks to use subtask retries. 
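As a worked example of the naming rule in the comment above, an in-package fragment with hypothetical values (fmt import assumed; the embedded TaskExecutionID is left nil because name generation does not consult it):

    first := SubTaskExecutionID{nil, 7, "example-pod", 0}
    retry := SubTaskExecutionID{nil, 7, "example-pod", 1}
    fmt.Println(first.GetGeneratedName()) // "example-pod-7"
    fmt.Println(retry.GetGeneratedName()) // "example-pod-7-1"
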
@@ -107,7 +105,7 @@ func (s SubTaskExecutionID) GetGeneratedName() string { } func (s SubTaskExecutionID) GetLogSuffix() string { - return fmt.Sprintf(" #%d-%d", s.retryAttempt, s.originalIndex) + return fmt.Sprintf(" #%d-%d", s.retryAttempt, s.index) // TODO - I don't think this is correct - // should be originalIndex-retryAttempt [however](https://github.com/flyteorg/flyteplugins/pull/186#discussion_r666569825) From 9c1d9389d128fdce31dfe13bd4322bed34ecc893 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Tue, 15 Feb 2022 21:36:50 -0600 Subject: [PATCH 08/20] cleaned up subtask execution context and added unit tests Signed-off-by: Daniel Rammer --- .../plugins/array/k8s/integration_test.go | 26 +++--- .../plugins/array/k8s/subtask_exec_context.go | 79 ++++++------------- .../array/k8s/subtask_exec_context_test.go | 32 ++++++++ 3 files changed, 66 insertions(+), 71 deletions(-) create mode 100644 go/tasks/plugins/array/k8s/subtask_exec_context_test.go diff --git a/go/tasks/plugins/array/k8s/integration_test.go b/go/tasks/plugins/array/k8s/integration_test.go index 9b76265c5..17edf2077 100644 --- a/go/tasks/plugins/array/k8s/integration_test.go +++ b/go/tasks/plugins/array/k8s/integration_test.go @@ -1,33 +1,29 @@ package k8s import ( + "context" "strconv" "testing" - "github.com/flyteorg/flyteplugins/go/tasks/plugins/array" - - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/io" - - "github.com/flyteorg/flytestdlib/storage" + "github.com/flyteorg/flyteidl/clients/go/coreutils" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" - - "context" - - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" - + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core/mocks" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/io" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/workqueue" + "github.com/flyteorg/flyteplugins/go/tasks/plugins/array" "github.com/flyteorg/flytestdlib/contextutils" + "github.com/flyteorg/flytestdlib/promutils" "github.com/flyteorg/flytestdlib/promutils/labeled" + "github.com/flyteorg/flytestdlib/storage" - "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core/mocks" - "github.com/flyteorg/flytestdlib/promutils" "github.com/stretchr/testify/assert" - "github.com/flyteorg/flyteidl/clients/go/coreutils" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "sigs.k8s.io/controller-runtime/pkg/client" ) func init() { diff --git a/go/tasks/plugins/array/k8s/subtask_exec_context.go b/go/tasks/plugins/array/k8s/subtask_exec_context.go index ed1aeebae..4ad01a676 100644 --- a/go/tasks/plugins/array/k8s/subtask_exec_context.go +++ b/go/tasks/plugins/array/k8s/subtask_exec_context.go @@ -14,7 +14,8 @@ import ( podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" ) -// TaskExecutionContext provides a layer on top of core TaskExecutionContext with a custom TaskExecutionMetadata. 
+// SubTaskExecutionContext wraps the core TaskExecutionContext so that the k8s array task context +// can be used within the pod plugin type SubTaskExecutionContext struct { pluginsCore.TaskExecutionContext arrayInputReader io.InputReader @@ -23,36 +24,39 @@ type SubTaskExecutionContext struct { subtaskReader SubTaskReader } -// InputReader overrides the TaskExecutionContext from base and returns a specialized context for Array +// InputReader overrides the base TaskExecutionContext to return a custom InputReader func (s SubTaskExecutionContext) InputReader() io.InputReader { return s.arrayInputReader } +// TaskExecutionMetadata overrides the base TaskExecutionContext to return custom +// TaskExecutionMetadata func (s SubTaskExecutionContext) TaskExecutionMetadata() pluginsCore.TaskExecutionMetadata { return s.metadataOverride } +// TaskReader overrides the base TaskExecutionContext to return a custom TaskReader func (s SubTaskExecutionContext) TaskReader() pluginsCore.TaskReader { return s.subtaskReader } -func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTemplate *core.TaskTemplate, index, originalIndex int, retryAttempt uint64) SubTaskExecutionContext { +// newSubtaskExecutionContext constructs a SubTaskExecutionContext using the provided parameters +func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTemplate *core.TaskTemplate, + executionIndex, originalIndex int, retryAttempt uint64) SubTaskExecutionContext { arrayInputReader := array.GetInputReader(tCtx, taskTemplate) - //metadataOverride := tCtx.TaskExecutionMetadata() taskExecutionMetadata := tCtx.TaskExecutionMetadata() taskExecutionID := taskExecutionMetadata.GetTaskExecutionID() metadataOverride := SubTaskExecutionMetadata{ taskExecutionMetadata, SubTaskExecutionID{ taskExecutionID, - index, + executionIndex, taskExecutionID.GetGeneratedName(), retryAttempt, }, } subtaskTemplate := &core.TaskTemplate{} - //var subtaskTemplate *core.TaskTemplate *subtaskTemplate = *taskTemplate if subtaskTemplate != nil { @@ -75,24 +79,28 @@ func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTempl } } +// SubTaskReader wraps the core TaskReader to customize the task template task type and version type SubTaskReader struct { pluginsCore.TaskReader subtaskTemplate *core.TaskTemplate } +// Read overrides the base TaskReader to return a custom TaskTemplate func (s SubTaskReader) Read(ctx context.Context) (*core.TaskTemplate, error) { return s.subtaskTemplate, nil } +// SubTaskExecutionID wraps the core TaskExecutionID to customize the generated pod name type SubTaskExecutionID struct { pluginsCore.TaskExecutionID - index int + executionIndex int parentName string retryAttempt uint64 } +// GetGeneratedName overrides the base TaskExecutionID to append the subtask index and retryAttempt func (s SubTaskExecutionID) GetGeneratedName() string { - indexStr := strconv.Itoa(s.index) + indexStr := strconv.Itoa(s.executionIndex) // If the retryAttempt is 0 we do not include it in the pod name. The gives us backwards // compatibility in the ability to dynamically transition running map tasks to use subtask retries. 
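For orientation, a sketch of how this wrapper is consumed, mirroring the management loop shown earlier rather than adding new behavior: the pod plugin only ever sees the wrapped metadata, so every name it derives is the subtask name instead of the parent map task name.

    stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, childIdx, originalIdx, retryAttempt)
    podName := stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName()
    // podName is "<parent-generated-name>-<executionIndex>" and gains a
    // "-<retryAttempt>" suffix once the subtask has retried at least once.
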
@@ -104,62 +112,21 @@ func (s SubTaskExecutionID) GetGeneratedName() string { return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v-%v", s.parentName, indexStr, retryAttemptStr)) } +// GetLogSuffix returns the suffix which should be appended to subtask log names func (s SubTaskExecutionID) GetLogSuffix() string { - return fmt.Sprintf(" #%d-%d", s.retryAttempt, s.index) - - // TODO - I don't think this is correct - - // should be originalIndex-retryAttempt [however](https://github.com/flyteorg/flyteplugins/pull/186#discussion_r666569825) - // but [this](https://github.com/flyteorg/flyteplugins/blob/b671abcba2f67cff5610bb9050ee75762dba3d03/go/tasks/plugins/array/k8s/task.go#L183) - /* - synopsis - the GetGeneratedName uses the taskExecutionContext retryAttempt to compute a name (ex. pod-0) - before tracking subtask retry attempts the pod name was this retry attempt with the index (ex. pod-0-0 or pod-1-0) for retry attempt 0 and 1 of index 0 - now we track subtask retry attempts individually so the pod name could be (pod-0-1-0) we're leaving this for now - but might want to change in the fugure - */ + // Append the retry attempt and executionIndex so that log names coincide with pod names per + // https://github.com/flyteorg/flyteplugins/pull/186#discussion_r666569825. Prior to tracking + // subtask retryAttempts the pod name used the map task retry number. We may want to revisit. + return fmt.Sprintf(" #%d-%d", s.retryAttempt, s.executionIndex) } -// TODO hamersaw - enable secrets -// TaskExecutionMetadata provides a layer on top of the core TaskExecutionMetadata with customized annotations and labels -// for k8s plugins. +// SubTaskExecutionMetadata wraps the core TaskExecutionMetadata to customize the TaskExecutionID type SubTaskExecutionMetadata struct { pluginsCore.TaskExecutionMetadata - subtaskExecutionID SubTaskExecutionID - //annotations map[string]string - //labels map[string]string } +// GetTaskExecutionID overrides the base TaskExecutionMetadata to return a custom TaskExecutionID func (s SubTaskExecutionMetadata) GetTaskExecutionID() pluginsCore.TaskExecutionID { return s.subtaskExecutionID } - -/*func (t TaskExecutionMetadata) GetLabels() map[string]string { - return t.labels -} - -func (t TaskExecutionMetadata) GetAnnotations() map[string]string { - return t.annotations -} - -// newTaskExecutionMetadata creates a TaskExecutionMetadata with secrets serialized as annotations and a label added -// to trigger the flyte pod webhook -func newTaskExecutionMetadata(tCtx pluginsCore.TaskExecutionMetadata, taskTmpl *core.TaskTemplate) (TaskExecutionMetadata, error) { - var err error - secretsMap := make(map[string]string) - injectSecretsLabel := make(map[string]string) - if taskTmpl.SecurityContext != nil && len(taskTmpl.SecurityContext.Secrets) > 0 { - secretsMap, err = secrets.MarshalSecretsToMapStrings(taskTmpl.SecurityContext.Secrets) - if err != nil { - return TaskExecutionMetadata{}, err - } - - injectSecretsLabel = map[string]string{ - secrets.PodLabel: secrets.PodLabelValue, - } - } - - return TaskExecutionMetadata{ - TaskExecutionMetadata: tCtx, - annotations: utils.UnionMaps(tCtx.GetAnnotations(), secretsMap), - labels: utils.UnionMaps(tCtx.GetLabels(), injectSecretsLabel), - }, nil -}*/ diff --git a/go/tasks/plugins/array/k8s/subtask_exec_context_test.go b/go/tasks/plugins/array/k8s/subtask_exec_context_test.go new file mode 100644 index 000000000..24ecbf425 --- /dev/null +++ b/go/tasks/plugins/array/k8s/subtask_exec_context_test.go @@ -0,0 +1,32 @@ +package 
k8s + +import ( + "context" + "fmt" + "testing" + + podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" + + "github.com/stretchr/testify/assert" +) + +func TestSubTaskExecutionContext(t *testing.T) { + ctx := context.Background() + + tCtx := getMockTaskExecutionContext(ctx, 0) + taskTemplate, err := tCtx.TaskReader().Read(ctx) + assert.Nil(t, err) + + executionIndex := 0 + originalIndex := 5 + retryAttempt := uint64(1) + + stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, executionIndex, originalIndex, retryAttempt) + + assert.Equal(t, stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), fmt.Sprintf("notfound-%d-%d", executionIndex, retryAttempt)) + + subtaskTemplate, err := stCtx.TaskReader().Read(ctx) + assert.Nil(t, err) + assert.Equal(t, int32(2), subtaskTemplate.TaskTypeVersion) + assert.Equal(t, podPlugin.ContainerTaskType, subtaskTemplate.Type) +} From c86f3b35c2cdf1d7de16e2863d29fd32045e6328 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Wed, 16 Feb 2022 06:44:02 -0600 Subject: [PATCH 09/20] fixed existing unit tests Signed-off-by: Daniel Rammer --- go/tasks/logs/logging_utils_test.go | 60 +++++++++-------- go/tasks/plugins/k8s/pod/container_test.go | 23 +++---- go/tasks/plugins/k8s/pod/sidecar_test.go | 75 +++++++++------------- 3 files changed, 74 insertions(+), 84 deletions(-) diff --git a/go/tasks/logs/logging_utils_test.go b/go/tasks/logs/logging_utils_test.go index 3bc37a8e3..02f5d778b 100755 --- a/go/tasks/logs/logging_utils_test.go +++ b/go/tasks/logs/logging_utils_test.go @@ -15,29 +15,32 @@ import ( const podName = "PodName" func TestGetLogsForContainerInPod_NoPlugins(t *testing.T) { - assert.NoError(t, SetLogConfig(&LogConfig{})) - l, err := GetLogsForContainerInPod(context.TODO(), nil, 0, " Suffix") + logPlugin, err := InitializeLogPlugins(&LogConfig{}) + assert.NoError(t, err) + l, err := GetLogsForContainerInPod(context.TODO(), logPlugin, nil, 0, " Suffix") assert.NoError(t, err) assert.Nil(t, l) } func TestGetLogsForContainerInPod_NoLogs(t *testing.T) { - assert.NoError(t, SetLogConfig(&LogConfig{ + logPlugin, err := InitializeLogPlugins(&LogConfig{ IsCloudwatchEnabled: true, CloudwatchRegion: "us-east-1", CloudwatchLogGroup: "/kubernetes/flyte-production", - })) - p, err := GetLogsForContainerInPod(context.TODO(), nil, 0, " Suffix") + }) + assert.NoError(t, err) + p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, nil, 0, " Suffix") assert.NoError(t, err) assert.Nil(t, p) } func TestGetLogsForContainerInPod_BadIndex(t *testing.T) { - assert.NoError(t, SetLogConfig(&LogConfig{ + logPlugin, err := InitializeLogPlugins(&LogConfig{ IsCloudwatchEnabled: true, CloudwatchRegion: "us-east-1", CloudwatchLogGroup: "/kubernetes/flyte-production", - })) + }) + assert.NoError(t, err) pod := &v1.Pod{ Spec: v1.PodSpec{ @@ -57,17 +60,18 @@ func TestGetLogsForContainerInPod_BadIndex(t *testing.T) { } pod.Name = podName - p, err := GetLogsForContainerInPod(context.TODO(), pod, 1, " Suffix") + p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, pod, 1, " Suffix") assert.NoError(t, err) assert.Nil(t, p) } func TestGetLogsForContainerInPod_MissingStatus(t *testing.T) { - assert.NoError(t, SetLogConfig(&LogConfig{ + logPlugin, err := InitializeLogPlugins(&LogConfig{ IsCloudwatchEnabled: true, CloudwatchRegion: "us-east-1", CloudwatchLogGroup: "/kubernetes/flyte-production", - })) + }) + assert.NoError(t, err) pod := &v1.Pod{ Spec: v1.PodSpec{ @@ -81,16 +85,17 @@ func TestGetLogsForContainerInPod_MissingStatus(t 
*testing.T) { } pod.Name = podName - p, err := GetLogsForContainerInPod(context.TODO(), pod, 1, " Suffix") + p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, pod, 1, " Suffix") assert.NoError(t, err) assert.Nil(t, p) } func TestGetLogsForContainerInPod_Cloudwatch(t *testing.T) { - assert.NoError(t, SetLogConfig(&LogConfig{IsCloudwatchEnabled: true, + logPlugin, err := InitializeLogPlugins(&LogConfig{IsCloudwatchEnabled: true, CloudwatchRegion: "us-east-1", CloudwatchLogGroup: "/kubernetes/flyte-production", - })) + }) + assert.NoError(t, err) pod := &v1.Pod{ Spec: v1.PodSpec{ @@ -110,16 +115,17 @@ func TestGetLogsForContainerInPod_Cloudwatch(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), pod, 0, " Suffix") + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, pod, 0, " Suffix") assert.Nil(t, err) assert.Len(t, logs, 1) } func TestGetLogsForContainerInPod_K8s(t *testing.T) { - assert.NoError(t, SetLogConfig(&LogConfig{ + logPlugin, err := InitializeLogPlugins(&LogConfig{ IsKubernetesEnabled: true, KubernetesURL: "k8s.com", - })) + }) + assert.NoError(t, err) pod := &v1.Pod{ Spec: v1.PodSpec{ @@ -139,19 +145,20 @@ func TestGetLogsForContainerInPod_K8s(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), pod, 0, " Suffix") + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, pod, 0, " Suffix") assert.Nil(t, err) assert.Len(t, logs, 1) } func TestGetLogsForContainerInPod_All(t *testing.T) { - assert.NoError(t, SetLogConfig(&LogConfig{ + logPlugin, err := InitializeLogPlugins(&LogConfig{ IsKubernetesEnabled: true, KubernetesURL: "k8s.com", IsCloudwatchEnabled: true, CloudwatchRegion: "us-east-1", CloudwatchLogGroup: "/kubernetes/flyte-production", - })) + }) + assert.NoError(t, err) pod := &v1.Pod{ Spec: v1.PodSpec{ @@ -171,18 +178,18 @@ func TestGetLogsForContainerInPod_All(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), pod, 0, " Suffix") + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, pod, 0, " Suffix") assert.Nil(t, err) assert.Len(t, logs, 2) } func TestGetLogsForContainerInPod_Stackdriver(t *testing.T) { - - assert.NoError(t, SetLogConfig(&LogConfig{ + logPlugin, err := InitializeLogPlugins(&LogConfig{ IsStackDriverEnabled: true, GCPProjectName: "myGCPProject", StackdriverLogResourceName: "aws_ec2_instance", - })) + }) + assert.NoError(t, err) pod := &v1.Pod{ Spec: v1.PodSpec{ @@ -202,7 +209,7 @@ func TestGetLogsForContainerInPod_Stackdriver(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), pod, 0, " Suffix") + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, pod, 0, " Suffix") assert.Nil(t, err) assert.Len(t, logs, 1) } @@ -252,7 +259,8 @@ func TestGetLogsForContainerInPod_LegacyTemplate(t *testing.T) { } func assertTestSucceeded(tb testing.TB, config *LogConfig, expectedTaskLogs []*core.TaskLog) { - assert.NoError(tb, SetLogConfig(config)) + logPlugin, err := InitializeLogPlugins(config) + assert.NoError(tb, err) pod := &v1.Pod{ ObjectMeta: v12.ObjectMeta{ @@ -275,7 +283,7 @@ func assertTestSucceeded(tb testing.TB, config *LogConfig, expectedTaskLogs []*c }, } - logs, err := GetLogsForContainerInPod(context.TODO(), pod, 0, " my-Suffix") + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, pod, 0, " my-Suffix") assert.Nil(tb, err) assert.Len(tb, logs, len(expectedTaskLogs)) if diff := deep.Equal(logs, expectedTaskLogs); len(diff) 
> 0 { diff --git a/go/tasks/plugins/k8s/pod/container_test.go b/go/tasks/plugins/k8s/pod/container_test.go index 3832b00f3..d4b907915 100644 --- a/go/tasks/plugins/k8s/pod/container_test.go +++ b/go/tasks/plugins/k8s/pod/container_test.go @@ -106,9 +106,8 @@ func dummyContainerTaskContext(resources *v1.ResourceRequirements, command []str } func TestContainerTaskExecutor_BuildIdentityResource(t *testing.T) { - p := plugin{defaultPodBuilder, podBuilders} taskMetadata := &pluginsCoreMock.TaskExecutionMetadata{} - r, err := p.BuildIdentityResource(context.TODO(), taskMetadata) + r, err := DefaultPodPlugin.BuildIdentityResource(context.TODO(), taskMetadata) assert.NoError(t, err) assert.NotNil(t, r) _, ok := r.(*v1.Pod) @@ -117,12 +116,11 @@ func TestContainerTaskExecutor_BuildIdentityResource(t *testing.T) { } func TestContainerTaskExecutor_BuildResource(t *testing.T) { - p := plugin{defaultPodBuilder, podBuilders} command := []string{"command"} args := []string{"{{.Input}}"} taskCtx := dummyContainerTaskContext(containerResourceRequirements, command, args) - r, err := p.BuildResource(context.TODO(), taskCtx) + r, err := DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.NoError(t, err) assert.NotNil(t, r) j, ok := r.(*v1.Pod) @@ -142,7 +140,6 @@ func TestContainerTaskExecutor_BuildResource(t *testing.T) { } func TestContainerTaskExecutor_GetTaskStatus(t *testing.T) { - p := plugin{defaultPodBuilder, podBuilders} j := &v1.Pod{ Status: v1.PodStatus{}, } @@ -150,21 +147,21 @@ func TestContainerTaskExecutor_GetTaskStatus(t *testing.T) { ctx := context.TODO() t.Run("running", func(t *testing.T) { j.Status.Phase = v1.PodRunning - phaseInfo, err := p.GetTaskPhase(ctx, nil, j) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(ctx, nil, j) assert.NoError(t, err) assert.Equal(t, pluginsCore.PhaseRunning, phaseInfo.Phase()) }) t.Run("queued", func(t *testing.T) { j.Status.Phase = v1.PodPending - phaseInfo, err := p.GetTaskPhase(ctx, nil, j) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(ctx, nil, j) assert.NoError(t, err) assert.Equal(t, pluginsCore.PhaseQueued, phaseInfo.Phase()) }) t.Run("failNoCondition", func(t *testing.T) { j.Status.Phase = v1.PodFailed - phaseInfo, err := p.GetTaskPhase(ctx, nil, j) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(ctx, nil, j) assert.NoError(t, err) assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase()) ec := phaseInfo.Err().GetCode() @@ -180,7 +177,7 @@ func TestContainerTaskExecutor_GetTaskStatus(t *testing.T) { Type: v1.PodReasonUnschedulable, }, } - phaseInfo, err := p.GetTaskPhase(ctx, nil, j) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(ctx, nil, j) assert.NoError(t, err) assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase()) ec := phaseInfo.Err().GetCode() @@ -189,7 +186,7 @@ func TestContainerTaskExecutor_GetTaskStatus(t *testing.T) { t.Run("success", func(t *testing.T) { j.Status.Phase = v1.PodSucceeded - phaseInfo, err := p.GetTaskPhase(ctx, nil, j) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(ctx, nil, j) assert.NoError(t, err) assert.NotNil(t, phaseInfo) assert.Equal(t, pluginsCore.PhaseSuccess, phaseInfo.Phase()) @@ -197,14 +194,12 @@ func TestContainerTaskExecutor_GetTaskStatus(t *testing.T) { } func TestContainerTaskExecutor_GetProperties(t *testing.T) { - p := plugin{defaultPodBuilder, podBuilders} expected := k8s.PluginProperties{} - assert.Equal(t, expected, p.GetProperties()) + assert.Equal(t, expected, DefaultPodPlugin.GetProperties()) } func 
TestContainerTaskExecutor_GetTaskStatus_InvalidImageName(t *testing.T) { ctx := context.TODO() - p := plugin{defaultPodBuilder, podBuilders} reason := "InvalidImageName" message := "Failed to apply default image tag \"TEST/flyteorg/myapp:latest\": couldn't parse image reference" + " \"TEST/flyteorg/myapp:latest\": invalid reference format: repository name must be lowercase" @@ -235,7 +230,7 @@ func TestContainerTaskExecutor_GetTaskStatus_InvalidImageName(t *testing.T) { t.Run("failInvalidImageName", func(t *testing.T) { pendingPod.Status.Phase = v1.PodPending - phaseInfo, err := p.GetTaskPhase(ctx, nil, pendingPod) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(ctx, nil, pendingPod) finalReason := fmt.Sprintf("|%s", reason) finalMessage := fmt.Sprintf("|%s", message) assert.NoError(t, err) diff --git a/go/tasks/plugins/k8s/pod/sidecar_test.go b/go/tasks/plugins/k8s/pod/sidecar_test.go index 77cb40afa..e301e5bf2 100644 --- a/go/tasks/plugins/k8s/pod/sidecar_test.go +++ b/go/tasks/plugins/k8s/pod/sidecar_test.go @@ -51,7 +51,7 @@ func getSidecarTaskTemplateForTest(sideCarJob sidecarJob) *core.TaskTemplate { panic(err) } return &core.TaskTemplate{ - Type: sidecarTaskType, + Type: SidecarTaskType, Custom: &structObj, } } @@ -199,10 +199,10 @@ func TestBuildSidecarResource_TaskType2(t *testing.T) { } task := core.TaskTemplate{ - Type: sidecarTaskType, + Type: SidecarTaskType, TaskTypeVersion: 2, Config: map[string]string{ - primaryContainerKey: "primary container", + PrimaryContainerKey: "primary container", }, Target: &core.TaskTemplate_K8SPod{ K8SPod: &core.K8SPod{ @@ -241,12 +241,11 @@ func TestBuildSidecarResource_TaskType2(t *testing.T) { DefaultMemoryRequest: resource.MustParse("1024Mi"), GpuResourceName: ResourceNvidiaGPU, })) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&task, sidecarResourceRequirements) - res, err := p.BuildResource(context.TODO(), taskCtx) + res, err := DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.Nil(t, err) assert.EqualValues(t, map[string]string{ - primaryContainerKey: "primary container", + PrimaryContainerKey: "primary container", "anno": "bar", }, res.GetAnnotations()) assert.EqualValues(t, map[string]string{ @@ -284,10 +283,10 @@ func TestBuildSidecarResource_TaskType2(t *testing.T) { func TestBuildSidecarResource_TaskType2_Invalid_Spec(t *testing.T) { task := core.TaskTemplate{ - Type: sidecarTaskType, + Type: SidecarTaskType, TaskTypeVersion: 2, Config: map[string]string{ - primaryContainerKey: "primary container", + PrimaryContainerKey: "primary container", }, Target: &core.TaskTemplate_K8SPod{ K8SPod: &core.K8SPod{ @@ -303,9 +302,8 @@ func TestBuildSidecarResource_TaskType2_Invalid_Spec(t *testing.T) { }, } - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&task, sidecarResourceRequirements) - _, err := p.BuildResource(context.TODO(), taskCtx) + _, err := DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.EqualError(t, err, "[BadTaskSpecification] Pod tasks with task type version > 1 should specify their target as a K8sPod with a defined pod spec") } @@ -323,11 +321,11 @@ func TestBuildSidecarResource_TaskType1(t *testing.T) { } task := core.TaskTemplate{ - Type: sidecarTaskType, + Type: SidecarTaskType, Custom: structObj, TaskTypeVersion: 1, Config: map[string]string{ - primaryContainerKey: "primary container", + PrimaryContainerKey: "primary container", }, } @@ -352,12 +350,11 @@ func TestBuildSidecarResource_TaskType1(t *testing.T) { 
DefaultCPURequest: resource.MustParse("1024m"), DefaultMemoryRequest: resource.MustParse("1024Mi"), })) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&task, sidecarResourceRequirements) - res, err := p.BuildResource(context.TODO(), taskCtx) + res, err := DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.Nil(t, err) assert.EqualValues(t, map[string]string{ - primaryContainerKey: "primary container", + PrimaryContainerKey: "primary container", }, res.GetAnnotations()) assert.EqualValues(t, map[string]string{}, res.GetLabels()) @@ -405,7 +402,7 @@ func TestBuildSideResource_TaskType1_InvalidSpec(t *testing.T) { } task := core.TaskTemplate{ - Type: sidecarTaskType, + Type: SidecarTaskType, Custom: structObj, TaskTypeVersion: 1, } @@ -418,16 +415,15 @@ func TestBuildSideResource_TaskType1_InvalidSpec(t *testing.T) { DefaultCPURequest: resource.MustParse("1024m"), DefaultMemoryRequest: resource.MustParse("1024Mi"), })) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&task, sidecarResourceRequirements) - _, err = p.BuildResource(context.TODO(), taskCtx) + _, err = DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.EqualError(t, err, "[BadTaskSpecification] invalid TaskSpecification, config needs to be non-empty and include missing [primary_container_name] key") task.Config = map[string]string{ "foo": "bar", } taskCtx = getDummySidecarTaskContext(&task, sidecarResourceRequirements) - _, err = p.BuildResource(context.TODO(), taskCtx) + _, err = DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.EqualError(t, err, "[BadTaskSpecification] invalid TaskSpecification, config missing [primary_container_name] key in [map[foo:bar]]") } @@ -446,7 +442,7 @@ func TestBuildSidecarResource(t *testing.T) { t.Fatal(err) } task := core.TaskTemplate{ - Type: sidecarTaskType, + Type: SidecarTaskType, Custom: &sidecarCustom, } @@ -471,12 +467,11 @@ func TestBuildSidecarResource(t *testing.T) { DefaultCPURequest: resource.MustParse("1024m"), DefaultMemoryRequest: resource.MustParse("1024Mi"), })) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&task, sidecarResourceRequirements) - res, err := p.BuildResource(context.TODO(), taskCtx) + res, err := DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.Nil(t, err) assert.EqualValues(t, map[string]string{ - primaryContainerKey: "a container", + PrimaryContainerKey: "a container", "a1": "a1", }, res.GetAnnotations()) @@ -528,9 +523,8 @@ func TestBuildSidecarReosurceMissingAnnotationsAndLabels(t *testing.T) { task := getSidecarTaskTemplateForTest(sideCarJob) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(task, sidecarResourceRequirements) - resp, err := p.BuildResource(context.TODO(), taskCtx) + resp, err := DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.NoError(t, err) assert.EqualValues(t, map[string]string{}, resp.GetLabels()) assert.EqualValues(t, map[string]string{"primary_container_name": "PrimaryContainer"}, resp.GetAnnotations()) @@ -550,9 +544,8 @@ func TestBuildSidecarResourceMissingPrimary(t *testing.T) { task := getSidecarTaskTemplateForTest(sideCarJob) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(task, sidecarResourceRequirements) - _, err := p.BuildResource(context.TODO(), taskCtx) + _, err := DefaultPodPlugin.BuildResource(context.TODO(), taskCtx) assert.True(t, errors.Is(err, 
errors2.Errorf("BadTaskSpecification", ""))) } @@ -584,11 +577,10 @@ func TestGetTaskSidecarStatus(t *testing.T) { }, } res.SetAnnotations(map[string]string{ - primaryContainerKey: "PrimaryContainer", + PrimaryContainerKey: "PrimaryContainer", }) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(task, sidecarResourceRequirements) - phaseInfo, err := p.GetTaskPhase(context.TODO(), taskCtx, res) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(context.TODO(), taskCtx, res) assert.Nil(t, err) assert.Equal(t, expectedTaskPhase, phaseInfo.Phase(), "Expected [%v] got [%v] instead, for podPhase [%v]", expectedTaskPhase, phaseInfo.Phase(), podPhase) @@ -612,11 +604,10 @@ func TestDemystifiedSidecarStatus_PrimaryFailed(t *testing.T) { }, } res.SetAnnotations(map[string]string{ - primaryContainerKey: "Primary", + PrimaryContainerKey: "Primary", }) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&core.TaskTemplate{}, sidecarResourceRequirements) - phaseInfo, err := p.GetTaskPhase(context.TODO(), taskCtx, res) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(context.TODO(), taskCtx, res) assert.Nil(t, err) assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase()) } @@ -638,11 +629,10 @@ func TestDemystifiedSidecarStatus_PrimarySucceeded(t *testing.T) { }, } res.SetAnnotations(map[string]string{ - primaryContainerKey: "Primary", + PrimaryContainerKey: "Primary", }) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&core.TaskTemplate{}, sidecarResourceRequirements) - phaseInfo, err := p.GetTaskPhase(context.TODO(), taskCtx, res) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(context.TODO(), taskCtx, res) assert.Nil(t, err) assert.Equal(t, pluginsCore.PhaseSuccess, phaseInfo.Phase()) } @@ -664,11 +654,10 @@ func TestDemystifiedSidecarStatus_PrimaryRunning(t *testing.T) { }, } res.SetAnnotations(map[string]string{ - primaryContainerKey: "Primary", + PrimaryContainerKey: "Primary", }) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&core.TaskTemplate{}, sidecarResourceRequirements) - phaseInfo, err := p.GetTaskPhase(context.TODO(), taskCtx, res) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(context.TODO(), taskCtx, res) assert.Nil(t, err) assert.Equal(t, pluginsCore.PhaseRunning, phaseInfo.Phase()) } @@ -685,17 +674,15 @@ func TestDemystifiedSidecarStatus_PrimaryMissing(t *testing.T) { }, } res.SetAnnotations(map[string]string{ - primaryContainerKey: "Primary", + PrimaryContainerKey: "Primary", }) - p := &plugin{defaultPodBuilder, podBuilders} taskCtx := getDummySidecarTaskContext(&core.TaskTemplate{}, sidecarResourceRequirements) - phaseInfo, err := p.GetTaskPhase(context.TODO(), taskCtx, res) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(context.TODO(), taskCtx, res) assert.Nil(t, err) assert.Equal(t, pluginsCore.PhasePermanentFailure, phaseInfo.Phase()) } func TestGetProperties(t *testing.T) { - p := &plugin{defaultPodBuilder, podBuilders} expected := k8s.PluginProperties{} - assert.Equal(t, expected, p.GetProperties()) + assert.Equal(t, expected, DefaultPodPlugin.GetProperties()) } From 0f47b024832f8d7e3c73e93308daf5e23c803e48 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Wed, 16 Feb 2022 06:55:04 -0600 Subject: [PATCH 10/20] fixed lint issues Signed-off-by: Daniel Rammer --- .../core/mocks/fake_k8s_client.go | 2 +- go/tasks/plugins/array/k8s/executor.go | 1 - go/tasks/plugins/array/k8s/management.go | 4 +- 
go/tasks/plugins/array/k8s/management_test.go | 43 ++++++++++--------- go/tasks/plugins/array/k8s/subtask.go | 10 ++--- .../plugins/array/k8s/subtask_exec_context.go | 13 +++--- go/tasks/plugins/k8s/pod/plugin.go | 2 +- 7 files changed, 37 insertions(+), 38 deletions(-) diff --git a/go/tasks/pluginmachinery/core/mocks/fake_k8s_client.go b/go/tasks/pluginmachinery/core/mocks/fake_k8s_client.go index 3d8278967..120b9c6d7 100644 --- a/go/tasks/pluginmachinery/core/mocks/fake_k8s_client.go +++ b/go/tasks/pluginmachinery/core/mocks/fake_k8s_client.go @@ -92,7 +92,7 @@ func (m *FakeKubeClient) Create(ctx context.Context, obj client.Object, opts ... // if obj is a *v1.Pod then append a ContainerStatus for each Container pod, ok := obj.(*v1.Pod) if ok { - for i, _ := range pod.Spec.Containers { + for i := range pod.Spec.Containers { if len(pod.Status.ContainerStatuses) > i { continue } diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index ecb2aeb44..c90707684 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -16,7 +16,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" - ) const executorName = "k8s-array" diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index 070edf78f..b8f42ddfc 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -94,7 +94,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon } // Initialize subtask retryAttempts to 0 so that, in tandem with the podName logic, we - // maintain backwards compatability. + // maintain backwards compatibility. for i := 0; i < currentState.GetExecutionArraySize(); i++ { retryAttemptsArray.SetItem(i, 0) } @@ -156,7 +156,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon } // create subtask - launchSubtask(ctx, stCtx, config, kubeClient) + err = launchSubtask(ctx, stCtx, config, kubeClient) if err != nil && !k8serrors.IsAlreadyExists(err) { if k8serrors.IsForbidden(err) { if strings.Contains(err.Error(), "exceeded quota") { diff --git a/go/tasks/plugins/array/k8s/management_test.go b/go/tasks/plugins/array/k8s/management_test.go index d76f6c60d..6b55b0053 100644 --- a/go/tasks/plugins/array/k8s/management_test.go +++ b/go/tasks/plugins/array/k8s/management_test.go @@ -126,30 +126,30 @@ func TestCheckSubTasksState(t *testing.T) { fakeKubeClient := mocks.NewFakeKubeClient() fakeKubeCache := mocks.NewFakeKubeCache() - for i:=0; i Date: Wed, 16 Feb 2022 08:44:48 -0600 Subject: [PATCH 11/20] added function comments to management.go Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/management.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index b8f42ddfc..adb456fec 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -22,6 +22,8 @@ import ( k8serrors "k8s.io/apimachinery/pkg/api/errors" ) +// allocateResource attempts to allot resources for the specified parameter with the +// TaskExecutionContexts ResourceManager. 
func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) (core.AllocationStatus, error) { if !IsResourceConfigSet(config.ResourceConfig) { return core.AllocationStatusGranted, nil @@ -41,6 +43,8 @@ func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, confi return allocationStatus, nil } +// deallocateResource attempts to release resources for the specified parameter with the +// TaskExecutionContexts ResourceManager. func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) error { if !IsResourceConfigSet(config.ResourceConfig) { return nil @@ -56,6 +60,9 @@ func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, con return nil } +// LaunchAndCheckSubTasksState iterates over each subtask performing operations to transition them +// to a terminal state. This may include creating new k8s resources, monitoring exising k8s +// resources, retrying failed attempts, or declaring a permanent failure among others. func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference, currentState *arrayCore.State) ( newState *arrayCore.State, logLinks []*idlCore.TaskLog, subTaskIDs []*string, err error) { @@ -158,6 +165,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon // create subtask err = launchSubtask(ctx, stCtx, config, kubeClient) if err != nil && !k8serrors.IsAlreadyExists(err) { + // TODO check this if k8serrors.IsForbidden(err) { if strings.Contains(err.Error(), "exceeded quota") { // TODO: Quota errors are retried forever, it would be good to have support for backoff strategy. @@ -257,6 +265,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon return newState, logLinks, subTaskIDs, nil } +// TerminateSubTasks performs operations to gracefully shutdown all subtasks. This may include +// aborting and finalizing active k8s resources. 
func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, currentState *arrayCore.State) error { // TODO - fix /*size := currentState.GetExecutionArraySize() From 67ddc74b0cedb8effd144999d60bc3c19e30f55a Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Wed, 16 Feb 2022 09:38:05 -0600 Subject: [PATCH 12/20] fixed phase transitions from failures when launching or monitoring subtasks Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/management.go | 44 ++++--------- go/tasks/plugins/array/k8s/management_test.go | 12 ++-- go/tasks/plugins/array/k8s/subtask.go | 64 +++++++++++++------ 3 files changed, 62 insertions(+), 58 deletions(-) diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index adb456fec..c932db062 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -3,7 +3,7 @@ package k8s import ( "context" "fmt" - "strings" + "time" idlCore "github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core" @@ -18,8 +18,6 @@ import ( "github.com/flyteorg/flytestdlib/bitarray" "github.com/flyteorg/flytestdlib/logger" "github.com/flyteorg/flytestdlib/storage" - - k8serrors "k8s.io/apimachinery/pkg/api/errors" ) // allocateResource attempts to allot resources for the specified parameter with the @@ -61,7 +59,7 @@ func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, con } // LaunchAndCheckSubTasksState iterates over each subtask performing operations to transition them -// to a terminal state. This may include creating new k8s resources, monitoring exising k8s +// to a terminal state. This may include creating new k8s resources, monitoring existing k8s // resources, retrying failed attempts, or declaring a permanent failure among others. func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference, currentState *arrayCore.State) ( @@ -147,6 +145,10 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, childIdx, originalIdx, retryAttempt) podName := stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName() + // depending on the existing subtask phase we either a launch new k8s resource or monitor + // an existing instance + var phaseInfo core.PhaseInfo + var perr error if existingPhase == core.PhaseUndefined || existingPhase == core.PhaseWaitingForResources || existingPhase == core.PhaseRetryableFailure { // attempt to allocateResource allocationStatus, err := allocateResource(ctx, stCtx, config, podName) @@ -158,35 +160,17 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) if allocationStatus != core.AllocationStatusGranted { - newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseWaitingForResources)) - continue - } - - // create subtask - err = launchSubtask(ctx, stCtx, config, kubeClient) - if err != nil && !k8serrors.IsAlreadyExists(err) { - // TODO check this - if k8serrors.IsForbidden(err) { - if strings.Contains(err.Error(), "exceeded quota") { - // TODO: Quota errors are retried forever, it would be good to have support for backoff strategy. - logger.Infof(ctx, "Failed to launch job, resource quota exceeded. 
Err: %v", err) - newState = newState.SetPhase(arrayCore.PhaseWaitingForResources, 0).SetReason("Not enough resources to launch job") - } else { - newState = newState.SetPhase(arrayCore.PhaseRetryableFailure, 0).SetReason("Failed to launch job.") - } - - newState.SetReason(err.Error()) - return newState, logLinks, subTaskIDs, nil - } - - return currentState, logLinks, subTaskIDs, err + phaseInfo = core.PhaseInfoWaitingForResourcesInfo(time.Now(), core.DefaultPhaseVersion, "Exceeded ResourceManager quota", nil) + } else { + phaseInfo, perr = launchSubtask(ctx, stCtx, config, kubeClient) } + } else { + phaseInfo, perr = getSubtaskPhaseInfo(ctx, stCtx, config, kubeClient, logPlugin) } - // monitor pod - phaseInfo, err := getSubtaskPhaseInfo(ctx, stCtx, config, kubeClient, logPlugin) - if err != nil { - return currentState, logLinks, subTaskIDs, err + // validate and process phaseInfo and perr + if perr != nil { + return currentState, logLinks, subTaskIDs, perr } if phaseInfo.Err() != nil { diff --git a/go/tasks/plugins/array/k8s/management_test.go b/go/tasks/plugins/array/k8s/management_test.go index 6b55b0053..8e5cd6b1e 100644 --- a/go/tasks/plugins/array/k8s/management_test.go +++ b/go/tasks/plugins/array/k8s/management_test.go @@ -183,7 +183,7 @@ func TestCheckSubTasksState(t *testing.T) { assert.Equal(t, arrayCore.PhaseCheckingSubTaskExecutions.String(), p.String()) resourceManager.AssertNumberOfCalls(t, "AllocateResource", subtaskCount) for _, subtaskPhaseIndex := range newState.GetArrayStatus().Detailed.GetItems() { - assert.Equal(t, core.PhaseRunning, core.Phases[subtaskPhaseIndex]) + assert.Equal(t, core.PhaseQueued, core.Phases[subtaskPhaseIndex]) } }) @@ -254,7 +254,7 @@ func TestCheckSubTasksState(t *testing.T) { } // execute - newState, _, subTaskIDs, err := LaunchAndCheckSubTasksState(ctx, tCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", currentState) + newState, _, _, err := LaunchAndCheckSubTasksState(ctx, tCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", currentState) // validate results assert.Nil(t, err) @@ -264,7 +264,6 @@ func TestCheckSubTasksState(t *testing.T) { for _, subtaskPhaseIndex := range newState.GetArrayStatus().Detailed.GetItems() { assert.Equal(t, core.PhaseWaitingForResources, core.Phases[subtaskPhaseIndex]) } - assert.Empty(t, subTaskIDs, "subtask ids are only populated when monitor is called for a successfully launched task") // execute again - with resources available and validate results nresourceManager := mocks.ResourceManager{} @@ -273,15 +272,14 @@ func TestCheckSubTasksState(t *testing.T) { ntCtx := getMockTaskExecutionContext(ctx, 0) ntCtx.OnResourceManager().Return(&nresourceManager) - lastState, _, subTaskIDs, err := LaunchAndCheckSubTasksState(ctx, ntCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", newState) + lastState, _, _, err := LaunchAndCheckSubTasksState(ctx, ntCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", newState) assert.Nil(t, err) np, _ := lastState.GetPhase() assert.Equal(t, arrayCore.PhaseCheckingSubTaskExecutions.String(), np.String()) resourceManager.AssertNumberOfCalls(t, "AllocateResource", subtaskCount) for _, subtaskPhaseIndex := range lastState.GetArrayStatus().Detailed.GetItems() { - assert.Equal(t, core.PhaseRunning, core.Phases[subtaskPhaseIndex]) + assert.Equal(t, core.PhaseQueued, core.Phases[subtaskPhaseIndex]) } - assert.Equal(t, subtaskCount, len(subTaskIDs)) }) t.Run("LaunchRetryableFailures", func(t *testing.T) { @@ -325,7 +323,7 @@ func TestCheckSubTasksState(t 
*testing.T) { assert.Equal(t, arrayCore.PhaseCheckingSubTaskExecutions.String(), p.String()) resourceManager.AssertNumberOfCalls(t, "AllocateResource", subtaskCount) for i, subtaskPhaseIndex := range newState.GetArrayStatus().Detailed.GetItems() { - assert.Equal(t, core.PhaseRunning, core.Phases[subtaskPhaseIndex]) + assert.Equal(t, core.PhaseQueued, core.Phases[subtaskPhaseIndex]) assert.Equal(t, bitarray.Item(1), newState.RetryAttempts.GetItem(i)) } }) diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index c89eab48f..5a4ff6069 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -5,14 +5,18 @@ import ( "fmt" "regexp" "strconv" + "strings" + "time" + "github.com/flyteorg/flyteplugins/go/tasks/errors" pluginsCore "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s/config" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils" podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" - errors2 "github.com/flyteorg/flytestdlib/errors" + stdErrors "github.com/flyteorg/flytestdlib/errors" + "github.com/flyteorg/flytestdlib/logger" v1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -21,11 +25,11 @@ import ( ) const ( - ErrBuildPodTemplate errors2.ErrorCode = "POD_TEMPLATE_FAILED" - ErrReplaceCmdTemplate errors2.ErrorCode = "CMD_TEMPLATE_FAILED" - FlyteK8sArrayIndexVarName string = "FLYTE_K8S_ARRAY_INDEX" - finalizer string = "flyte/array" - JobIndexVarName string = "BATCH_JOB_ARRAY_INDEX_VAR_NAME" + ErrBuildPodTemplate stdErrors.ErrorCode = "POD_TEMPLATE_FAILED" + ErrReplaceCmdTemplate stdErrors.ErrorCode = "CMD_TEMPLATE_FAILED" + FlyteK8sArrayIndexVarName string = "FLYTE_K8S_ARRAY_INDEX" + finalizer string = "flyte/array" + JobIndexVarName string = "BATCH_JOB_ARRAY_INDEX_VAR_NAME" ) var ( @@ -84,23 +88,23 @@ func addMetadata(stCtx SubTaskExecutionContext, cfg *Config, pod *v1.Pod) { return nil }*/ -func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, config *Config, kubeClient pluginsCore.KubeClient) error { +func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, config *Config, kubeClient pluginsCore.KubeClient) (pluginsCore.PhaseInfo, error) { o, err := podPlugin.DefaultPodPlugin.BuildResource(ctx, stCtx) pod := o.(*v1.Pod) if err != nil { - return err + return pluginsCore.PhaseInfoUndefined, err } addMetadata(stCtx, config, pod) // inject maptask specific container environment variables if len(pod.Spec.Containers) == 0 { - return errors2.Wrapf(ErrReplaceCmdTemplate, err, "No containers found in podSpec.") + return pluginsCore.PhaseInfoUndefined, stdErrors.Wrapf(ErrReplaceCmdTemplate, err, "No containers found in podSpec.") } containerIndex, err := getTaskContainerIndex(pod) if err != nil { - return err + return pluginsCore.PhaseInfoUndefined, err } pod.Spec.Containers[containerIndex].Env = append(pod.Spec.Containers[containerIndex].Env, v1.EnvVar{ @@ -112,7 +116,28 @@ func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, config *C pod.Spec.Containers[containerIndex].Env = append(pod.Spec.Containers[containerIndex].Env, arrayJobEnvVars...) 
- return kubeClient.GetClient().Create(ctx, pod) + logger.Infof(ctx, "Creating Object: Type:[%v], Object:[%v/%v]", pod.GetObjectKind().GroupVersionKind(), pod.GetNamespace(), pod.GetName()) + err = kubeClient.GetClient().Create(ctx, pod) + if err != nil && !k8serrors.IsAlreadyExists(err) { + if k8serrors.IsForbidden(err) { + if strings.Contains(err.Error(), "exceeded quota") { + logger.Warnf(ctx, "Failed to launch job, resource quota exceeded and the operation is not guarded by back-off. err: %v", err) + return pluginsCore.PhaseInfoWaitingForResourcesInfo(time.Now(), pluginsCore.DefaultPhaseVersion, fmt.Sprintf("Exceeded resourcequota: %s", err.Error()), nil), nil + } + return pluginsCore.PhaseInfoRetryableFailure("RuntimeFailure", err.Error(), nil), nil + } else if k8serrors.IsBadRequest(err) || k8serrors.IsInvalid(err) { + logger.Errorf(ctx, "Badly formatted resource for plugin [%s], err %s", executorName, err) + // return pluginsCore.DoTransition(pluginsCore.PhaseInfoFailure("BadTaskFormat", err.Error(), nil)), nil + } else if k8serrors.IsRequestEntityTooLargeError(err) { + logger.Errorf(ctx, "Badly formatted resource for plugin [%s], err %s", executorName, err) + return pluginsCore.PhaseInfoFailure("EntityTooLarge", err.Error(), nil), nil + } + reason := k8serrors.ReasonForError(err) + logger.Errorf(ctx, "Failed to launch job, system error. err: %v", err) + return pluginsCore.PhaseInfoUndefined, errors.Wrapf(stdErrors.ErrorCode(reason), err, "failed to create resource") + } + + return pluginsCore.PhaseInfoQueued(time.Now(), pluginsCore.DefaultPhaseVersion, "task submitted to K8s"), nil } /*func finalizeSubtask() error { @@ -123,7 +148,8 @@ func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, config *C func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, config *Config, kubeClient pluginsCore.KubeClient, logPlugin tasklog.Plugin) (pluginsCore.PhaseInfo, error) { o, err := podPlugin.DefaultPodPlugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) if err != nil { - return pluginsCore.PhaseInfoUndefined, err + logger.Errorf(ctx, "Failed to build the Resource with name: %v. Error: %v", stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), err) + return pluginsCore.PhaseInfoFailure("BadTaskDefinition", fmt.Sprintf("Failed to build resource, caused by: %s", err.Error()), nil), nil } pod := o.(*v1.Pod) @@ -135,23 +161,19 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con if isK8sObjectNotExists(err) { // This happens sometimes because a node gets removed and K8s deletes the pod. This will result in a // Pod does not exist error. This should be retried using the retry policy - //logger.Warningf(ctx, "Failed to find the Resource with name: %v. Error: %v", nsName, err) // TODO - log + logger.Warnf(ctx, "Failed to find the Resource with name: %v. Error: %v", nsName, err) failureReason := fmt.Sprintf("resource not found, name [%s]. reason: %s", nsName.String(), err.Error()) - //return pluginsCore.DoTransition(pluginsCore.PhaseInfoSystemRetryableFailure("ResourceDeletedExternally", failureReason, nil)), nil - - // TODO - validate? - // return pluginsCore.PhaseInfoUndefined, err? return pluginsCore.PhaseInfoSystemRetryableFailure("ResourceDeletedExternally", failureReason, nil), nil } - //logger.Warningf(ctx, "Failed to retrieve Resource Details with name: %v. Error: %v", nsName, err) - // TODO - validate? + logger.Warnf(ctx, "Failed to retrieve Resource Details with name: %v. 
Error: %v", nsName, err) return pluginsCore.PhaseInfoUndefined, err } stID, _ := stCtx.TaskExecutionMetadata().GetTaskExecutionID().(SubTaskExecutionID) phaseInfo, err := podPlugin.DefaultPodPlugin.GetTaskPhaseWithLogs(ctx, stCtx, pod, logPlugin, stID.GetLogSuffix()) if err != nil { + logger.Warnf(ctx, "failed to check status of resource in plugin [%s], with error: %s", executorName, err.Error()) return pluginsCore.PhaseInfoUndefined, err } @@ -173,7 +195,7 @@ func getTaskContainerIndex(pod *v1.Pod) (int, error) { return 0, nil } // For tasks with a K8sPod task target, they may produce multiple containers but at least one must be the designated primary. - return -1, errors2.Errorf(ErrBuildPodTemplate, "Expected a specified primary container key when building an array job with a K8sPod spec target") + return -1, stdErrors.Errorf(ErrBuildPodTemplate, "Expected a specified primary container key when building an array job with a K8sPod spec target") } @@ -182,7 +204,7 @@ func getTaskContainerIndex(pod *v1.Pod) (int, error) { return idx, nil } } - return -1, errors2.Errorf(ErrBuildPodTemplate, "Couldn't find any container matching the primary container key when building an array job with a K8sPod spec target") + return -1, stdErrors.Errorf(ErrBuildPodTemplate, "Couldn't find any container matching the primary container key when building an array job with a K8sPod spec target") } func isK8sObjectNotExists(err error) bool { From f707820c6d1416d6785e9399466e32d6a73a21fd Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Thu, 17 Feb 2022 09:07:37 -0600 Subject: [PATCH 13/20] added finalizeSubtask function Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/management.go | 36 +++++++- go/tasks/plugins/array/k8s/subtask.go | 109 ++++++++++++++++++++--- 2 files changed, 133 insertions(+), 12 deletions(-) diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index c932db062..a8b9096e5 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -163,6 +163,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon phaseInfo = core.PhaseInfoWaitingForResourcesInfo(time.Now(), core.DefaultPhaseVersion, "Exceeded ResourceManager quota", nil) } else { phaseInfo, perr = launchSubtask(ctx, stCtx, config, kubeClient) + // TODO if this fails do we need to deallocate the resource to mitigate leaks? } } else { phaseInfo, perr = getSubtaskPhaseInfo(ctx, stCtx, config, kubeClient, logPlugin) @@ -206,7 +207,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon return currentState, logLinks, subTaskIDs, err } - // TODO - finalize and delete resource? + // TODO - finalize resource? } // validate parallelism @@ -252,6 +253,37 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon // TerminateSubTasks performs operations to gracefully shutdown all subtasks. This may include // aborting and finalizing active k8s resources. 
func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, currentState *arrayCore.State) error { + taskTemplate, err := tCtx.TaskReader().Read(ctx) + if err != nil { + return err + } else if taskTemplate == nil { + return errors.Errorf(errors.BadTaskSpecification, "Required value not set, taskTemplate is nil") + } + + messageCollector := errorcollector.NewErrorMessageCollector() + for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() { + existingPhase := core.Phases[existingPhaseIdx] + retryAttempt := currentState.RetryAttempts.GetItem(childIdx) + + // return immediately if subtask has completed or not yet started + if existingPhase.IsTerminal() || existingPhase == core.PhaseUndefined { + continue + } + + originalIdx := arrayCore.CalculateOriginalIndex(childIdx, currentState.GetIndexesToCache()) + stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, childIdx, originalIdx, retryAttempt) + + err := finalizeSubtask(ctx, stCtx, config, kubeClient) + if err != nil { + messageCollector.Collect(childIdx, err.Error()) + } + } + + if messageCollector.Length() > 0 { + return fmt.Errorf(messageCollector.Summary(config.MaxErrorStringLength)) + } + + return nil // TODO - fix /*size := currentState.GetExecutionArraySize() messageCollector := errorcollector.NewErrorMessageCollector() @@ -276,5 +308,5 @@ func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kube return fmt.Errorf(errs.Summary(config.MaxErrorStringLength)) }*/ - return nil + //return nil } diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index 5a4ff6069..92da35839 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -22,6 +22,8 @@ import ( k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8stypes "k8s.io/apimachinery/pkg/types" + + "sigs.k8s.io/controller-runtime/pkg/client" ) const ( @@ -42,8 +44,9 @@ var ( namespaceRegex = regexp.MustCompile("(?i){{.namespace}}(?i)") ) -func addMetadata(stCtx SubTaskExecutionContext, cfg *Config, pod *v1.Pod) { - k8sPluginCfg := config.GetK8sPluginConfig() +// addMetadata sets k8s pod metadata that is either specifically required by the k8s array plugin +// or defined in the plugin configuration. +func addMetadata(stCtx SubTaskExecutionContext, cfg *Config, k8sPluginCfg *config.K8sPluginConfig, pod *v1.Pod) { taskExecutionMetadata := stCtx.TaskExecutionMetadata() // Default to parent namespace @@ -88,14 +91,30 @@ func addMetadata(stCtx SubTaskExecutionContext, cfg *Config, pod *v1.Pod) { return nil }*/ -func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, config *Config, kubeClient pluginsCore.KubeClient) (pluginsCore.PhaseInfo, error) { +// clearFinalizers removes finalizers (if they exist) from the k8s resource +func clearFinalizers(ctx context.Context, o client.Object, kubeClient pluginsCore.KubeClient) error { + if len(o.GetFinalizers()) > 0 { + o.SetFinalizers([]string{}) + err := kubeClient.GetClient().Update(ctx, o) + if err != nil && !isK8sObjectNotExists(err) { + logger.Warningf(ctx, "Failed to clear finalizers for Resource with name: %v/%v. 
Error: %v", o.GetNamespace(), o.GetName(), err) + return err + } + } else { + logger.Debugf(ctx, "Finalizers are already empty for Resource with name: %v/%v", o.GetNamespace(), o.GetName()) + } + return nil +} + +// launchSubtask creates a k8s pod defined by the SubTaskExecutionContext and Config. +func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Config, kubeClient pluginsCore.KubeClient) (pluginsCore.PhaseInfo, error) { o, err := podPlugin.DefaultPodPlugin.BuildResource(ctx, stCtx) pod := o.(*v1.Pod) if err != nil { return pluginsCore.PhaseInfoUndefined, err } - addMetadata(stCtx, config, pod) + addMetadata(stCtx, cfg, config.GetK8sPluginConfig(), pod) // inject maptask specific container environment variables if len(pod.Spec.Containers) == 0 { @@ -140,12 +159,80 @@ func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, config *C return pluginsCore.PhaseInfoQueued(time.Now(), pluginsCore.DefaultPhaseVersion, "task submitted to K8s"), nil } -/*func finalizeSubtask() error { - // TODO - return nil -}*/ +// finalizeSubtask performs operations to complete the k8s pod defined by the SubTaskExecutionContext +// and Config. These may include removing finalizers and deleting the k8s resource. +func finalizeSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Config, kubeClient pluginsCore.KubeClient) error { + errs := stdErrors.ErrorCollection{} + var o client.Object + var nsName k8stypes.NamespacedName + k8sPluginCfg := config.GetK8sPluginConfig() + if k8sPluginCfg.InjectFinalizer || k8sPluginCfg.DeleteResourceOnFinalize { + /*o, err = e.plugin.BuildIdentityResource(ctx, tCtx.TaskExecutionMetadata()) + if err != nil { + // This will recurrent, so we will skip further finalize + logger.Errorf(ctx, "Failed to build the Resource with name: %v. Error: %v, when finalizing.", tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), err) + return nil + } + + e.AddObjectMetadata(tCtx.TaskExecutionMetadata(), o, config.GetK8sPluginConfig())*/ + o, err := podPlugin.DefaultPodPlugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) + if err != nil { + // This will recurrent, so we will skip further finalize + logger.Errorf(ctx, "Failed to build the Resource with name: %v. Error: %v, when finalizing.", stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), err) + return nil + } + + addMetadata(stCtx, cfg, config.GetK8sPluginConfig(), o.(*v1.Pod)) + nsName = k8stypes.NamespacedName{Namespace: o.GetNamespace(), Name: o.GetName()} + } + + // In InjectFinalizer is on, it means we may have added the finalizers when we launched this resource. Attempt to + // clear them to allow the object to be deleted/garbage collected. If InjectFinalizer was turned on (through config) + // after the resource was created, we will not find any finalizers to clear and the object may have already been + // deleted at this point. Therefore, account for these cases and do not consider them errors. + if k8sPluginCfg.InjectFinalizer { + // Attempt to get resource from informer cache, if not found, retrieve it from API server. + if err := kubeClient.GetClient().Get(ctx, nsName, o); err != nil { + if isK8sObjectNotExists(err) { + return nil + } + // This happens sometimes because a node gets removed and K8s deletes the pod. This will result in a + // Pod does not exist error. This should be retried using the retry policy + logger.Warningf(ctx, "Failed in finalizing get Resource with name: %v. 
Error: %v", nsName, err) + return err + } + + // This must happen after sending admin event. It's safe against partial failures because if the event failed, we will + // simply retry in the next round. If the event succeeded but this failed, we will try again the next round to send + // the same event (idempotent) and then come here again... + err := clearFinalizers(ctx, o, kubeClient) + if err != nil { + errs.Append(err) + } + } + + // If we should delete the resource when finalize is called, do a best effort delete. + //if k8sPluginCfg.DeleteResourceOnFinalize && !e.plugin.GetProperties().DisableDeleteResourceOnFinalize { + if k8sPluginCfg.DeleteResourceOnFinalize { + // Attempt to delete resource, if not found, return success. + if err := kubeClient.GetClient().Delete(ctx, o); err != nil { + if isK8sObjectNotExists(err) { + return errs.ErrorOrDefault() + } + + // This happens sometimes because a node gets removed and K8s deletes the pod. This will result in a + // Pod does not exist error. This should be retried using the retry policy + logger.Warningf(ctx, "Failed in finalizing. Failed to delete Resource with name: %v. Error: %v", nsName, err) + errs.Append(fmt.Errorf("finalize: failed to delete resource with name [%v]. Error: %w", nsName, err)) + } + } + + return errs.ErrorOrDefault() +} -func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, config *Config, kubeClient pluginsCore.KubeClient, logPlugin tasklog.Plugin) (pluginsCore.PhaseInfo, error) { +// getSubtaskPhaseInfo returns the PhaseInfo describing an existing k8s resource which is defined +// by the SubTaskExecutionContext and Config. +func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Config, kubeClient pluginsCore.KubeClient, logPlugin tasklog.Plugin) (pluginsCore.PhaseInfo, error) { o, err := podPlugin.DefaultPodPlugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) if err != nil { logger.Errorf(ctx, "Failed to build the Resource with name: %v. Error: %v", stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), err) @@ -153,7 +240,7 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con } pod := o.(*v1.Pod) - addMetadata(stCtx, config, pod) + addMetadata(stCtx, cfg, config.GetK8sPluginConfig(), pod) // Attempt to get resource from informer cache, if not found, retrieve it from API server. nsName := k8stypes.NamespacedName{Name: pod.GetName(), Namespace: pod.GetNamespace()} @@ -187,6 +274,7 @@ func getSubtaskPhaseInfo(ctx context.Context, stCtx SubTaskExecutionContext, con return phaseInfo, err } +// getTaskContainerIndex returns the index of the primary container in a k8s pod. func getTaskContainerIndex(pod *v1.Pod) (int, error) { primaryContainerName, ok := pod.Annotations[podPlugin.PrimaryContainerKey] // For tasks with a Container target, we only ever build one container as part of the pod @@ -207,6 +295,7 @@ func getTaskContainerIndex(pod *v1.Pod) (int, error) { return -1, stdErrors.Errorf(ErrBuildPodTemplate, "Couldn't find any container matching the primary container key when building an array job with a K8sPod spec target") } +// isK8sObjectNotExists returns true if the error is one which describes a non existant k8s object. 
func isK8sObjectNotExists(err error) bool { return k8serrors.IsNotFound(err) || k8serrors.IsGone(err) || k8serrors.IsResourceExpired(err) } From 9a4f4aeb6116dd4f8f58d958143b53226d25c04f Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Thu, 17 Feb 2022 09:46:51 -0600 Subject: [PATCH 14/20] added abort functionality Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/executor.go | 11 ++-- go/tasks/plugins/array/k8s/management.go | 33 ++---------- go/tasks/plugins/array/k8s/subtask.go | 67 ++++++++++++++++++++---- 3 files changed, 68 insertions(+), 43 deletions(-) diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index c90707684..c4f2444d1 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -144,18 +144,21 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c } func (e Executor) Abort(ctx context.Context, tCtx core.TaskExecutionContext) error { - return nil + pluginState := &arrayCore.State{} + if _, err := tCtx.PluginStateReader().Get(pluginState); err != nil { + return errors.Wrapf(errors.CorruptedPluginState, err, "Failed to read unmarshal custom state") + } + + return TerminateSubTasks(ctx, tCtx, e.kubeClient, GetConfig(), abortSubtask, pluginState) } func (e Executor) Finalize(ctx context.Context, tCtx core.TaskExecutionContext) error { - pluginConfig := GetConfig() - pluginState := &arrayCore.State{} if _, err := tCtx.PluginStateReader().Get(pluginState); err != nil { return errors.Wrapf(errors.CorruptedPluginState, err, "Failed to read unmarshal custom state") } - return TerminateSubTasks(ctx, tCtx, e.kubeClient, pluginConfig, pluginState) + return TerminateSubTasks(ctx, tCtx, e.kubeClient, GetConfig(), finalizeSubtask, pluginState) } func (e Executor) Start(ctx context.Context) error { diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index a8b9096e5..6d1dde7de 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -250,9 +250,11 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon return newState, logLinks, subTaskIDs, nil } -// TerminateSubTasks performs operations to gracefully shutdown all subtasks. This may include +// TerminateSubTasks performs operations to gracefully terminate all subtasks. This may include // aborting and finalizing active k8s resources. 
-func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, currentState *arrayCore.State) error { +func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, + terminateFunction func(context.Context, SubTaskExecutionContext, *Config, core.KubeClient) error, currentState *arrayCore.State) error { + taskTemplate, err := tCtx.TaskReader().Read(ctx) if err != nil { return err @@ -273,7 +275,7 @@ func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kube originalIdx := arrayCore.CalculateOriginalIndex(childIdx, currentState.GetIndexesToCache()) stCtx := newSubTaskExecutionContext(tCtx, taskTemplate, childIdx, originalIdx, retryAttempt) - err := finalizeSubtask(ctx, stCtx, config, kubeClient) + err := terminateFunction(ctx, stCtx, config, kubeClient) if err != nil { messageCollector.Collect(childIdx, err.Error()) } @@ -284,29 +286,4 @@ func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kube } return nil - // TODO - fix - /*size := currentState.GetExecutionArraySize() - messageCollector := errorcollector.NewErrorMessageCollector() - for childIdx := 0; childIdx < size; childIdx++ { - task := Task{ - ChildIdx: childIdx, - Config: config, - State: currentState, - } - - err := task.Abort(ctx, tCtx, kubeClient) - if err != nil { - messageCollector.Collect(childIdx, err.Error()) - } - err = task.Finalize(ctx, tCtx, kubeClient) - if err != nil { - messageCollector.Collect(childIdx, err.Error()) - } - } - - if errs.Length() > 0 { - return fmt.Errorf(errs.Summary(config.MaxErrorStringLength)) - }*/ - - //return nil } diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index 92da35839..4d2adfacd 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -11,6 +11,7 @@ import ( "github.com/flyteorg/flyteplugins/go/tasks/errors" pluginsCore "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/core" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/flytek8s/config" + "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/k8s" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog" "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/utils" podPlugin "github.com/flyteorg/flyteplugins/go/tasks/plugins/k8s/pod" @@ -86,10 +87,62 @@ func addMetadata(stCtx SubTaskExecutionContext, cfg *Config, k8sPluginCfg *confi } } -/*func abortSubtask() error { - // TODO +// abortSubtask attempts to interrupt the k8s pod defined by the SubTaskExecutionContext and Config +func abortSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Config, kubeClient pluginsCore.KubeClient) error { + logger.Infof(ctx, "KillTask invoked. We will attempt to delete object [%v].", + stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName()) + + var plugin k8s.Plugin + plugin = podPlugin.DefaultPodPlugin + + o, err := plugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) + if err != nil { + // This will recurrent, so we will skip further finalize + logger.Errorf(ctx, "Failed to build the Resource with name: %v. 
Error: %v, when finalizing.", stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), err) + return nil + } + + addMetadata(stCtx, cfg, config.GetK8sPluginConfig(), o.(*v1.Pod)) + + deleteResource := true + abortOverride, hasAbortOverride := plugin.(k8s.PluginAbortOverride) + + resourceToFinalize := o + var behavior k8s.AbortBehavior + + if hasAbortOverride { + behavior, err = abortOverride.OnAbort(ctx, stCtx, o) + deleteResource = err == nil && behavior.DeleteResource + if err == nil && behavior.Resource != nil { + resourceToFinalize = behavior.Resource + } + } + + if err != nil { + } else if deleteResource { + err = kubeClient.GetClient().Delete(ctx, resourceToFinalize) + } else { + if behavior.Patch != nil && behavior.Update == nil { + err = kubeClient.GetClient().Patch(ctx, resourceToFinalize, behavior.Patch.Patch, behavior.Patch.Options...) + } else if behavior.Patch == nil && behavior.Update != nil { + err = kubeClient.GetClient().Update(ctx, resourceToFinalize, behavior.Update.Options...) + } else { + err = errors.Errorf(errors.RuntimeFailure, "AbortBehavior for resource %v must specify either a Patch and an Update operation if Delete is set to false. Only one can be supplied.", resourceToFinalize.GetName()) + } + if behavior.DeleteOnErr && err != nil { + logger.Warningf(ctx, "Failed to apply AbortBehavior for resource %v with error %v. Will attempt to delete resource.", resourceToFinalize.GetName(), err) + err = kubeClient.GetClient().Delete(ctx, resourceToFinalize) + } + } + + if err != nil && !isK8sObjectNotExists(err) { + logger.Warningf(ctx, "Failed to clear finalizers for Resource with name: %v/%v. Error: %v", + resourceToFinalize.GetNamespace(), resourceToFinalize.GetName(), err) + return err + } + return nil -}*/ +} // clearFinalizers removes finalizers (if they exist) from the k8s resource func clearFinalizers(ctx context.Context, o client.Object, kubeClient pluginsCore.KubeClient) error { @@ -167,14 +220,6 @@ func finalizeSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Co var nsName k8stypes.NamespacedName k8sPluginCfg := config.GetK8sPluginConfig() if k8sPluginCfg.InjectFinalizer || k8sPluginCfg.DeleteResourceOnFinalize { - /*o, err = e.plugin.BuildIdentityResource(ctx, tCtx.TaskExecutionMetadata()) - if err != nil { - // This will recurrent, so we will skip further finalize - logger.Errorf(ctx, "Failed to build the Resource with name: %v. Error: %v, when finalizing.", tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), err) - return nil - } - - e.AddObjectMetadata(tCtx.TaskExecutionMetadata(), o, config.GetK8sPluginConfig())*/ o, err := podPlugin.DefaultPodPlugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) if err != nil { // This will recurrent, so we will skip further finalize From c17fd5eabfb13b6c965e3717df0165479409358e Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Thu, 17 Feb 2022 09:49:30 -0600 Subject: [PATCH 15/20] fixed lint issues Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/subtask.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index 4d2adfacd..a89ad4fb7 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -92,9 +92,7 @@ func abortSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Confi logger.Infof(ctx, "KillTask invoked. 
We will attempt to delete object [%v].", stCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName()) - var plugin k8s.Plugin - plugin = podPlugin.DefaultPodPlugin - + var plugin k8s.Plugin = podPlugin.DefaultPodPlugin o, err := plugin.BuildIdentityResource(ctx, stCtx.TaskExecutionMetadata()) if err != nil { // This will recurrent, so we will skip further finalize @@ -340,7 +338,7 @@ func getTaskContainerIndex(pod *v1.Pod) (int, error) { return -1, stdErrors.Errorf(ErrBuildPodTemplate, "Couldn't find any container matching the primary container key when building an array job with a K8sPod spec target") } -// isK8sObjectNotExists returns true if the error is one which describes a non existant k8s object. +// isK8sObjectNotExists returns true if the error is one which describes a non existent k8s object. func isK8sObjectNotExists(err error) bool { return k8serrors.IsNotFound(err) || k8serrors.IsGone(err) || k8serrors.IsResourceExpired(err) } From 9bcf21cada1321d37a238fc4357be8a226ad6204 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Thu, 17 Feb 2022 11:38:27 -0600 Subject: [PATCH 16/20] fixed log suffix to match pod name Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/subtask.go | 1 - .../plugins/array/k8s/subtask_exec_context.go | 22 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index a89ad4fb7..69342c6ea 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -255,7 +255,6 @@ func finalizeSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Co } // If we should delete the resource when finalize is called, do a best effort delete. - //if k8sPluginCfg.DeleteResourceOnFinalize && !e.plugin.GetProperties().DisableDeleteResourceOnFinalize { if k8sPluginCfg.DeleteResourceOnFinalize { // Attempt to delete resource, if not found, return success. if err := kubeClient.GetClient().Delete(ctx, o); err != nil { diff --git a/go/tasks/plugins/array/k8s/subtask_exec_context.go b/go/tasks/plugins/array/k8s/subtask_exec_context.go index 0d97240be..1d2ea1e85 100644 --- a/go/tasks/plugins/array/k8s/subtask_exec_context.go +++ b/go/tasks/plugins/array/k8s/subtask_exec_context.go @@ -54,6 +54,7 @@ func newSubTaskExecutionContext(tCtx pluginsCore.TaskExecutionContext, taskTempl executionIndex, taskExecutionID.GetGeneratedName(), retryAttempt, + taskExecutionID.GetID().RetryAttempt, }, } @@ -94,9 +95,10 @@ func (s SubTaskReader) Read(ctx context.Context) (*core.TaskTemplate, error) { // SubTaskExecutionID wraps the core TaskExecutionID to customize the generated pod name type SubTaskExecutionID struct { pluginsCore.TaskExecutionID - executionIndex int - parentName string - retryAttempt uint64 + executionIndex int + parentName string + subtaskRetryAttempt uint64 + taskRetryAttempt uint32 } // GetGeneratedName overrides the base TaskExecutionID to append the subtask index and retryAttempt @@ -105,20 +107,24 @@ func (s SubTaskExecutionID) GetGeneratedName() string { // If the retryAttempt is 0 we do not include it in the pod name. The gives us backwards // compatibility in the ability to dynamically transition running map tasks to use subtask retries. 
- if s.retryAttempt == 0 { + if s.subtaskRetryAttempt == 0 { return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v", s.parentName, indexStr)) } - retryAttemptStr := strconv.FormatUint(s.retryAttempt, 10) + retryAttemptStr := strconv.FormatUint(s.subtaskRetryAttempt, 10) return utils.ConvertToDNS1123SubdomainCompatibleString(fmt.Sprintf("%v-%v-%v", s.parentName, indexStr, retryAttemptStr)) } // GetLogSuffix returns the suffix which should be appended to subtask log names func (s SubTaskExecutionID) GetLogSuffix() string { // Append the retry attempt and executionIndex so that log names coincide with pod names per - // https://github.com/flyteorg/flyteplugins/pull/186#discussion_r666569825. Prior to tracking - // subtask retryAttempts the pod name used the map task retry number. We may want to revisit. - return fmt.Sprintf(" #%d-%d", s.retryAttempt, s.executionIndex) + // https://github.com/flyteorg/flyteplugins/pull/186#discussion_r666569825. To maintain + // backwards compatibility we append the subtaskRetryAttempt if it is not 0. + if s.subtaskRetryAttempt == 0 { + return fmt.Sprintf(" #%d-%d", s.taskRetryAttempt, s.executionIndex) + } + + return fmt.Sprintf(" #%d-%d-%d", s.taskRetryAttempt, s.executionIndex, s.subtaskRetryAttempt) } // SubTaskExecutionMetadata wraps the core TaskExecutionMetadata to customize the TaskExecutionID From bd3b357b2e112fcf813840da9edf6a938a52eb97 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Thu, 17 Feb 2022 11:53:16 -0600 Subject: [PATCH 17/20] fixed abort error Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/subtask.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go index 69342c6ea..b35bd6296 100644 --- a/go/tasks/plugins/array/k8s/subtask.go +++ b/go/tasks/plugins/array/k8s/subtask.go @@ -214,7 +214,7 @@ func launchSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Conf // and Config. These may include removing finalizers and deleting the k8s resource. func finalizeSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Config, kubeClient pluginsCore.KubeClient) error { errs := stdErrors.ErrorCollection{} - var o client.Object + var pod *v1.Pod var nsName k8stypes.NamespacedName k8sPluginCfg := config.GetK8sPluginConfig() if k8sPluginCfg.InjectFinalizer || k8sPluginCfg.DeleteResourceOnFinalize { @@ -225,8 +225,10 @@ func finalizeSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Co return nil } - addMetadata(stCtx, cfg, config.GetK8sPluginConfig(), o.(*v1.Pod)) - nsName = k8stypes.NamespacedName{Namespace: o.GetNamespace(), Name: o.GetName()} + pod = o.(*v1.Pod) + + addMetadata(stCtx, cfg, config.GetK8sPluginConfig(), pod) + nsName = k8stypes.NamespacedName{Namespace: pod.GetNamespace(), Name: pod.GetName()} } // In InjectFinalizer is on, it means we may have added the finalizers when we launched this resource. Attempt to @@ -235,7 +237,7 @@ func finalizeSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Co // deleted at this point. Therefore, account for these cases and do not consider them errors. if k8sPluginCfg.InjectFinalizer { // Attempt to get resource from informer cache, if not found, retrieve it from API server. 
- if err := kubeClient.GetClient().Get(ctx, nsName, o); err != nil { + if err := kubeClient.GetClient().Get(ctx, nsName, pod); err != nil { if isK8sObjectNotExists(err) { return nil } @@ -248,7 +250,7 @@ func finalizeSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Co // This must happen after sending admin event. It's safe against partial failures because if the event failed, we will // simply retry in the next round. If the event succeeded but this failed, we will try again the next round to send // the same event (idempotent) and then come here again... - err := clearFinalizers(ctx, o, kubeClient) + err := clearFinalizers(ctx, pod, kubeClient) if err != nil { errs.Append(err) } @@ -257,7 +259,7 @@ func finalizeSubtask(ctx context.Context, stCtx SubTaskExecutionContext, cfg *Co // If we should delete the resource when finalize is called, do a best effort delete. if k8sPluginCfg.DeleteResourceOnFinalize { // Attempt to delete resource, if not found, return success. - if err := kubeClient.GetClient().Delete(ctx, o); err != nil { + if err := kubeClient.GetClient().Delete(ctx, pod); err != nil { if isK8sObjectNotExists(err) { return errs.ErrorOrDefault() } From 86ed06adf0ebfd6d3d865b62a13eb58e06cb2c17 Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Thu, 17 Feb 2022 12:23:23 -0600 Subject: [PATCH 18/20] finalizing subtask on terminal phase Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/management.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index 6d1dde7de..e144b5a3a 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -207,7 +207,11 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon return currentState, logLinks, subTaskIDs, err } - // TODO - finalize resource? + err = finalizeSubtask(ctx, stCtx, config, kubeClient) + if err != nil { + logger.Errorf(ctx, "Error finalizing resource [%s] in Finalize [%s]", podName, err) + return currentState, logLinks, subTaskIDs, err + } } // validate parallelism From 5de13b33c6e7848cf0867a016c006cce0755458d Mon Sep 17 00:00:00 2001 From: Daniel Rammer Date: Thu, 17 Feb 2022 12:28:16 -0600 Subject: [PATCH 19/20] deallocating resource on launchSubtask fail Signed-off-by: Daniel Rammer --- go/tasks/plugins/array/k8s/management.go | 10 +++++++++- go/tasks/plugins/array/k8s/subtask.go | 3 ++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go index e144b5a3a..b4dde86f4 100644 --- a/go/tasks/plugins/array/k8s/management.go +++ b/go/tasks/plugins/array/k8s/management.go @@ -163,7 +163,15 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon phaseInfo = core.PhaseInfoWaitingForResourcesInfo(time.Now(), core.DefaultPhaseVersion, "Exceeded ResourceManager quota", nil) } else { phaseInfo, perr = launchSubtask(ctx, stCtx, config, kubeClient) - // TODO if this fails do we need to deallocate the resource to mitigate leaks? 
+
+				// if launchSubtask fails we attempt to deallocate the (previously allocated)
+				// resource to mitigate leaks
+				if perr != nil {
+					derr = deallocateResource(ctx, stCtx, config, podName)
+					if derr != nil {
+						logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err)
+					}
+				}
 			}
 		} else {
 			phaseInfo, perr = getSubtaskPhaseInfo(ctx, stCtx, config, kubeClient, logPlugin)
diff --git a/go/tasks/plugins/array/k8s/subtask.go b/go/tasks/plugins/array/k8s/subtask.go
index b35bd6296..f4e5c12d8 100644
--- a/go/tasks/plugins/array/k8s/subtask.go
+++ b/go/tasks/plugins/array/k8s/subtask.go
@@ -78,7 +78,8 @@ func addMetadata(stCtx SubTaskExecutionContext, cfg *Config, k8sPluginCfg *confi
 		pod.Spec.SchedulerName = cfg.DefaultScheduler
 	}
 
-	// TODO - should these be appends? or left as overrides?
+	// The legacy map task implemented these as overrides so they were left as such. May want to
+	// revisit whether they would serve better as appends.
 	if len(cfg.NodeSelector) != 0 {
 		pod.Spec.NodeSelector = cfg.NodeSelector
 	}

From 7995e1af2e8fbef0c2f976cae111bd19a53754ed Mon Sep 17 00:00:00 2001
From: Daniel Rammer 
Date: Thu, 17 Feb 2022 14:11:20 -0600
Subject: [PATCH 20/20] fixed management derr issue

Signed-off-by: Daniel Rammer 
---
 go/tasks/plugins/array/k8s/management.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/go/tasks/plugins/array/k8s/management.go b/go/tasks/plugins/array/k8s/management.go
index b4dde86f4..e5b7dd2cf 100644
--- a/go/tasks/plugins/array/k8s/management.go
+++ b/go/tasks/plugins/array/k8s/management.go
@@ -167,8 +167,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon
 				// if launchSubtask fails we attempt to deallocate the (previously allocated)
 				// resource to mitigate leaks
 				if perr != nil {
-					derr = deallocateResource(ctx, stCtx, config, podName)
-					if derr != nil {
+					perr = deallocateResource(ctx, stCtx, config, podName)
+					if perr != nil {
 						logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err)
 					}
 				}