diff --git a/agent/dockerclient/dockerapi/errors.go b/agent/dockerclient/dockerapi/errors.go index 8d8f179feb8..ae4261b5a43 100644 --- a/agent/dockerclient/dockerapi/errors.go +++ b/agent/dockerclient/dockerapi/errors.go @@ -50,6 +50,12 @@ func (err *DockerTimeoutError) Error() string { // ErrorName returns the name of the error func (err *DockerTimeoutError) ErrorName() string { return DockerTimeoutErrorName } +// IsRetriableError returns a boolean indicating whether the call that +// generated the error can be retried. +func (err DockerTimeoutError) IsRetriableError() bool { + return true +} + // OutOfMemoryError is a type for errors caused by running out of memory type OutOfMemoryError struct{} diff --git a/agent/engine/docker_task_engine.go b/agent/engine/docker_task_engine.go index fd284b88b2d..284bc83552a 100644 --- a/agent/engine/docker_task_engine.go +++ b/agent/engine/docker_task_engine.go @@ -25,6 +25,9 @@ import ( "sync" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/aws/amazon-ecs-agent/agent/api" apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container" apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status" @@ -102,11 +105,11 @@ const ( defaultMonitorExecAgentsInterval = 15 * time.Minute - stopContainerBackoffMin = time.Second - stopContainerBackoffMax = time.Second * 5 + defaultStopContainerBackoffMin = time.Second + defaultStopContainerBackoffMax = time.Second * 5 stopContainerBackoffJitter = 0.2 stopContainerBackoffMultiplier = 1.3 - stopContainerMaxRetryCount = 3 + stopContainerMaxRetryCount = 5 ) var newExponentialBackoff = retry.NewExponentialBackoff @@ -176,6 +179,8 @@ type DockerTaskEngine struct { monitorExecAgentsTicker *time.Ticker execCmdMgr execcmd.Manager monitorExecAgentsInterval time.Duration + stopContainerBackoffMin time.Duration + stopContainerBackoffMax time.Duration } // NewDockerTaskEngine returns a created, but uninitialized, DockerTaskEngine. @@ -214,6 +219,8 @@ func NewDockerTaskEngine(cfg *config.Config, handleDelay: time.Sleep, execCmdMgr: execCmdMgr, monitorExecAgentsInterval: defaultMonitorExecAgentsInterval, + stopContainerBackoffMin: defaultStopContainerBackoffMin, + stopContainerBackoffMax: defaultStopContainerBackoffMax, } dockerTaskEngine.initializeContainerStatusToTransitionFunction() @@ -1552,19 +1559,28 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api // for more information, see: https://github.com/moby/moby/issues/41587 func (engine *DockerTaskEngine) stopDockerContainer(dockerID, containerName string, apiTimeoutStopContainer time.Duration) dockerapi.DockerContainerMetadata { var md dockerapi.DockerContainerMetadata - backoff := newExponentialBackoff(stopContainerBackoffMin, stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier) + backoff := newExponentialBackoff(engine.stopContainerBackoffMin, engine.stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier) for i := 0; i < stopContainerMaxRetryCount; i++ { md = engine.client.StopContainer(engine.ctx, dockerID, apiTimeoutStopContainer) - if md.Error == nil || md.Error.ErrorName() != dockerapi.DockerTimeoutErrorName { + if md.Error == nil { + return md + } + cannotStopContainerError, ok := md.Error.(cannotStopContainerError) + if ok && !cannotStopContainerError.IsRetriableError() { return md } + if i < stopContainerMaxRetryCount-1 { - time.Sleep(backoff.Duration()) + retryIn := backoff.Duration() + logger.Warn(fmt.Sprintf("Error stopping container, retrying in %v", retryIn), logger.Fields{ + field.Container: containerName, + field.RuntimeID: dockerID, + field.Error: md.Error, + "attempt": i + 1, + }) + time.Sleep(retryIn) } } - if md.Error != nil && md.Error.ErrorName() == dockerapi.DockerTimeoutErrorName { - seelog.Warnf("Unable to stop container (%s) after %d attempts that timed out; giving up and marking container as stopped anyways", containerName, stopContainerMaxRetryCount) - } return md } diff --git a/agent/engine/docker_task_engine_test.go b/agent/engine/docker_task_engine_test.go index 65daa2820ad..1d7c863e5fa 100644 --- a/agent/engine/docker_task_engine_test.go +++ b/agent/engine/docker_task_engine_test.go @@ -175,7 +175,8 @@ func mocks(t *testing.T, ctx context.Context, cfg *config.Config) (*gomock.Contr imageManager, dockerstate.NewTaskEngineState(), metadataManager, nil, execCmdMgr) taskEngine.(*DockerTaskEngine)._time = mockTime taskEngine.(*DockerTaskEngine).ctx = ctx - + taskEngine.(*DockerTaskEngine).stopContainerBackoffMin = time.Millisecond + taskEngine.(*DockerTaskEngine).stopContainerBackoffMax = time.Millisecond * 2 return ctrl, client, mockTime, taskEngine, credentialsManager, imageManager, metadataManager } @@ -920,7 +921,10 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) { // Validate that timeouts are retried exactly 3 times client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()). Return(containerStopTimeoutError). - Times(3), + Times(5), + + client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}). + Times(1), ) } @@ -991,7 +995,10 @@ func TestTaskTransitionWhenStopContainerReturnsUnretriableError(t *testing.T) { // event. client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(dockerapi.DockerContainerMetadata{ Error: dockerapi.CannotStopContainerError{dockerapi.NoSuchContainerError{}}, - }).MinTimes(1), + }).MaxTimes(1), + client.EXPECT().SystemPing(gomock.Any(), gomock.Any()). + Return(dockerapi.PingResponse{}). + Times(1), ) } @@ -1042,7 +1049,7 @@ func TestTaskTransitionWhenStopContainerReturnsTransientErrorBeforeSucceeding(t client.EXPECT().StartContainer(gomock.Any(), containerID, defaultConfig.ContainerStartTimeout).Return( dockerapi.DockerContainerMetadata{DockerID: containerID}), // StopContainer errors out a couple of times - client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(2), + client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(4), // Since task is not in steady state, progressContainers causes // another invocation of StopContainer. Return the 'succeed' response, // which should cause the task engine to stop invoking this again and diff --git a/agent/engine/task_manager.go b/agent/engine/task_manager.go index c425a38efba..fe764938960 100644 --- a/agent/engine/task_manager.go +++ b/agent/engine/task_manager.go @@ -51,6 +51,7 @@ const ( // credentials from acs, after the timeout it will check the credentials manager // and start processing the task or start another round of waiting waitForPullCredentialsTimeout = 1 * time.Minute + systemPingTimeout = 5 * time.Second defaultTaskSteadyStatePollInterval = 5 * time.Minute defaultTaskSteadyStatePollIntervalJitter = 30 * time.Second transitionPollTime = 5 * time.Second @@ -132,6 +133,7 @@ type managedTask struct { cfg *config.Config credentialsManager credentials.Manager cniClient ecscni.CNIClient + dockerClient dockerapi.DockerClient taskStopWG *utilsync.SequentialWaitGroup acsMessages chan acsTransition @@ -180,6 +182,7 @@ func (engine *DockerTaskEngine) newManagedTask(task *apitask.Task) *managedTask containerChangeEventStream: engine.containerChangeEventStream, credentialsManager: engine.credentialsManager, cniClient: engine.cniClient, + dockerClient: engine.client, taskStopWG: engine.taskStopGroup, steadyStatePollInterval: engine.taskSteadyStatePollInterval, steadyStatePollIntervalJitter: engine.taskSteadyStatePollIntervalJitter, @@ -930,16 +933,16 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi.DockerContainerChangeEvent, container *apicontainer.Container, currentKnownStatus apicontainerstatus.ContainerStatus) bool { - // If docker returned a transient error while trying to stop a container, - // reset the known status to the current status and return - cannotStopContainerError, ok := event.Error.(cannotStopContainerError) - if ok && cannotStopContainerError.IsRetriableError() { - logger.Info("Error stopping the container; ignoring state change", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Container: container.Name, - field.RuntimeID: container.GetRuntimeID(), - "ErrorName": event.Error.ErrorName(), - field.Error: cannotStopContainerError.Error(), + + pr := mtask.dockerClient.SystemPing(mtask.ctx, systemPingTimeout) + if pr.Error != nil { + logger.Info("Error stopping the container, but docker seems to be unresponsive; ignoring state change", logger.Fields{ + field.TaskARN: mtask.Arn, + field.Container: container.Name, + field.RuntimeID: container.GetRuntimeID(), + "ErrorName": event.Error.ErrorName(), + field.Error: event.Error.Error(), + "SystemPingError": pr.Error, }) container.SetKnownStatus(currentKnownStatus) return false diff --git a/agent/engine/task_manager_test.go b/agent/engine/task_manager_test.go index 9a6a37779ff..b22958fcf9c 100644 --- a/agent/engine/task_manager_test.go +++ b/agent/engine/task_manager_test.go @@ -84,8 +84,8 @@ func TestHandleEventError(t *testing.T) { FromError: errors.New(""), }, ExpectedContainerKnownStatusSet: true, - ExpectedContainerKnownStatus: apicontainerstatus.ContainerRunning, - ExpectedOK: false, + ExpectedContainerKnownStatus: apicontainerstatus.ContainerStopped, + ExpectedOK: true, }, { Name: "Unretriable error with Stop", @@ -187,6 +187,15 @@ func TestHandleEventError(t *testing.T) { for _, tc := range testCases { t.Run(tc.Name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + client := mock_dockerapi.NewMockDockerClient(ctrl) + + if tc.EventStatus == apicontainerstatus.ContainerStopped { + client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}). + Times(1) + } + container := &apicontainer.Container{ KnownStatusUnsafe: tc.CurrentContainerKnownStatus, } @@ -203,8 +212,9 @@ func TestHandleEventError(t *testing.T) { Task: &apitask.Task{ Arn: "task1", }, - engine: &DockerTaskEngine{}, - cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior}, + engine: &DockerTaskEngine{}, + cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior}, + dockerClient: client, } ok := mtask.handleEventError(containerChange, tc.CurrentContainerKnownStatus) assert.Equal(t, tc.ExpectedOK, ok, "to proceed")