From 28f9aeb68d563492c182f4dccab2a408f2def8fd Mon Sep 17 00:00:00 2001 From: Surinder Singh Date: Fri, 28 Feb 2020 10:57:21 -0800 Subject: [PATCH] Use system failure for node deletion or infra level issues (#59) * Use system failure for node deletion or infra level issues * using version * removing PhaseSystemRetryableFailure * code review feedback * fixing a test * idl version * reverting tests --- go.mod | 2 +- go.sum | 4 ++-- go/tasks/pluginmachinery/core/phase.go | 7 ++++++- go/tasks/pluginmachinery/flytek8s/pod_helper.go | 2 +- go/tasks/plugins/array/k8s/monitor.go | 1 + go/tasks/plugins/hive/execution_state.go | 2 +- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 4fc0ed583..69683399d 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/golang/protobuf v1.3.3 github.com/googleapis/gnostic v0.4.1 // indirect github.com/hashicorp/golang-lru v0.5.4 - github.com/lyft/flyteidl v0.17.1 + github.com/lyft/flyteidl v0.17.5 github.com/lyft/flytestdlib v0.3.2 github.com/magiconair/properties v1.8.1 github.com/mitchellh/mapstructure v1.1.2 diff --git a/go.sum b/go.sum index 21d313983..962cd64f4 100644 --- a/go.sum +++ b/go.sum @@ -296,8 +296,8 @@ github.com/lyft/api v0.0.0-20191031200350-b49a72c274e0 h1:NGL46+1RYcCXb3sShp0nQq github.com/lyft/api v0.0.0-20191031200350-b49a72c274e0/go.mod h1:/L5qH+AD540e7Cetbui1tuJeXdmNhO8jM6VkXeDdDhQ= github.com/lyft/apimachinery v0.0.0-20191031200210-047e3ea32d7f h1:PGuAMDzAen0AulUfaEhNQMYmUpa41pAVo3zHI+GJsCM= github.com/lyft/apimachinery v0.0.0-20191031200210-047e3ea32d7f/go.mod h1:llRdnznGEAqC3DcNm6yEj472xaFVfLM7hnYofMb12tQ= -github.com/lyft/flyteidl v0.17.1 h1:XXi8sTSzPVXG337S1ZbOTi7PHIBgy1sIehhQu1eZpyI= -github.com/lyft/flyteidl v0.17.1/go.mod h1:/zQXxuHO11u/saxTTZc8oYExIGEShXB+xCB1/F1Cu20= +github.com/lyft/flyteidl v0.17.5 h1:nuUixm2glaJ4orKw3t/G0y1iG3ikYUR6FLxQy6NPmNM= +github.com/lyft/flyteidl v0.17.5/go.mod h1:/zQXxuHO11u/saxTTZc8oYExIGEShXB+xCB1/F1Cu20= github.com/lyft/flytestdlib v0.3.0 h1:nIkX4MlyYdcLLzaF35RI2P5BhARt+qMgHoFto8eVNzU= github.com/lyft/flytestdlib v0.3.0/go.mod h1:LJPPJlkFj+wwVWMrQT3K5JZgNhZi2mULsCG4ZYhinhU= github.com/lyft/flytestdlib v0.3.2 h1:bY6Y+Fg6Jdc7zY4GAYuR7t2hjWwynIdmRvtLcRNaGnw= diff --git a/go/tasks/pluginmachinery/core/phase.go b/go/tasks/pluginmachinery/core/phase.go index 78e60e252..059e43925 100644 --- a/go/tasks/pluginmachinery/core/phase.go +++ b/go/tasks/pluginmachinery/core/phase.go @@ -9,6 +9,7 @@ import ( ) const DefaultPhaseVersion = uint32(0) +const SystemErrorCode = "SystemError" //go:generate enumer -type=Phase @@ -184,5 +185,9 @@ func PhaseInfoFailure(code, reason string, info *TaskInfo) PhaseInfo { } func PhaseInfoRetryableFailure(code, reason string, info *TaskInfo) PhaseInfo { - return PhaseInfoFailed(PhaseRetryableFailure, &core.ExecutionError{Code: code, Message: reason}, info) + return PhaseInfoFailed(PhaseRetryableFailure, &core.ExecutionError{Code: code, Message: reason, Kind: core.ExecutionError_USER}, info) +} + +func PhaseInfoSystemRetryableFailure(code, reason string, info *TaskInfo) PhaseInfo { + return PhaseInfoFailed(PhaseRetryableFailure, &core.ExecutionError{Code: code, Message: reason, Kind: core.ExecutionError_SYSTEM}, info) } diff --git a/go/tasks/pluginmachinery/flytek8s/pod_helper.go b/go/tasks/pluginmachinery/flytek8s/pod_helper.go index 07edc4487..c9e342342 100755 --- a/go/tasks/pluginmachinery/flytek8s/pod_helper.go +++ b/go/tasks/pluginmachinery/flytek8s/pod_helper.go @@ -154,7 +154,7 @@ func DemystifyPending(status v1.PodStatus) (pluginsCore.PhaseInfo, error) { // So be default if the container is not waiting with the PodInitializing/ContainerCreating // reasons, then we will assume a failure reason, and fail instantly t := c.LastTransitionTime.Time - return pluginsCore.PhaseInfoRetryableFailure(c.Reason, c.Message, &pluginsCore.TaskInfo{ + return pluginsCore.PhaseInfoSystemRetryableFailure(c.Reason, c.Message, &pluginsCore.TaskInfo{ OccurredAt: &t, }), nil } diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index c9226ffb1..ea632f0e8 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -140,6 +140,7 @@ func CheckPodStatus(ctx context.Context, client core.KubeClient, name k8sTypes.N return core.PhaseInfoFailed(core.PhaseRetryableFailure, &idlCore.ExecutionError{ Code: string(k8serrors.ReasonForError(err)), Message: err.Error(), + Kind: idlCore.ExecutionError_SYSTEM, }, &core.TaskInfo{ OccurredAt: &now, }), nil diff --git a/go/tasks/plugins/hive/execution_state.go b/go/tasks/plugins/hive/execution_state.go index 4246d1293..cf33699a7 100644 --- a/go/tasks/plugins/hive/execution_state.go +++ b/go/tasks/plugins/hive/execution_state.go @@ -105,7 +105,7 @@ func MapExecutionStateToPhaseInfo(state ExecutionState, quboleClient client.Qubo case PhaseQueued: // TODO: Turn into config if state.CreationFailureCount > 5 { - phaseInfo = core.PhaseInfoRetryableFailure("QuboleFailure", "Too many creation attempts", nil) + phaseInfo = core.PhaseInfoSystemRetryableFailure("QuboleFailure", "Too many creation attempts", nil) } else { phaseInfo = core.PhaseInfoQueued(t, uint32(state.CreationFailureCount), "Waiting for Qubole launch") }