diff --git a/agent/dockerclient/dockerapi/docker_client.go b/agent/dockerclient/dockerapi/docker_client.go index 4935d7d8b7c..e93d83c24b8 100644 --- a/agent/dockerclient/dockerapi/docker_client.go +++ b/agent/dockerclient/dockerapi/docker_client.go @@ -162,6 +162,9 @@ type DockerClient interface { // should be provided for the request. ListContainers(context.Context, bool, time.Duration) ListContainersResponse + // SystemPing returns the Ping response from Docker's SystemPing API + SystemPing(context.Context, time.Duration) PingResponse + // ListImages returns the set of the images known to the Docker daemon ListImages(context.Context, time.Duration) ListImagesResponse @@ -750,7 +753,11 @@ func (dg *dockerGoClient) stopContainer(ctx context.Context, dockerID string, ti metadata := dg.containerMetadata(ctx, dockerID) if err != nil { seelog.Errorf("DockerGoClient: error stopping container ID=%s: %v", dockerID, err) - if metadata.Error == nil { + if metadata.Error != nil { + // Wrap metadata.Error in CannotStopContainerError in order to make the whole stopContainer operation + // retryable. + metadata.Error = CannotStopContainerError{metadata.Error} + } else { if strings.Contains(err.Error(), "No such container") { err = NoSuchContainerError{dockerID} } @@ -1099,6 +1106,42 @@ func (dg *dockerGoClient) listImages(ctx context.Context) ListImagesResponse { return ListImagesResponse{ImageIDs: imageIDs, RepoTags: imageRepoTags, Error: nil} } +func (dg *dockerGoClient) SystemPing(ctx context.Context, timeout time.Duration) PingResponse { + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + // Buffered channel so in the case of timeout it takes one write, never gets + // read, and can still be GC'd + response := make(chan PingResponse, 1) + go func() { response <- dg.systemPing(ctx) }() + select { + case resp := <-response: + return resp + case <-ctx.Done(): + // Context has either expired or canceled. 
If it has timed out, + send back the DockerTimeoutError + err := ctx.Err() + if err == context.DeadlineExceeded { + return PingResponse{Error: &DockerTimeoutError{timeout, "ping"}} + } + return PingResponse{Error: err} + } +} + +func (dg *dockerGoClient) systemPing(ctx context.Context) PingResponse { + client, err := dg.sdkDockerClient() + if err != nil { + return PingResponse{Error: err} + } + + pingResponse, err := client.Ping(ctx) + if err != nil { + return PingResponse{Error: err} + } + + return PingResponse{Response: &pingResponse} +} + func (dg *dockerGoClient) SupportedVersions() []dockerclient.DockerVersion { return dg.sdkClientFactory.FindSupportedAPIVersions() } diff --git a/agent/dockerclient/dockerapi/docker_client_test.go b/agent/dockerclient/dockerapi/docker_client_test.go index 7436eac7f36..5dd24f73d60 100644 --- a/agent/dockerclient/dockerapi/docker_client_test.go +++ b/agent/dockerclient/dockerapi/docker_client_test.go @@ -1008,6 +1008,34 @@ func TestDockerVersion(t *testing.T) { assert.Equal(t, "1.6.0", str, "Got unexpected version string: "+str) } +func TestSystemPing(t *testing.T) { + mockDockerSDK, client, _, _, _, done := dockerClientSetup(t) + defer done() + + mockDockerSDK.EXPECT().Ping(gomock.Any()).Return(types.Ping{APIVersion: "test_docker_api"}, nil) + + ctx, cancel := context.WithCancel(context.TODO()) + defer cancel() + pingResponse := client.SystemPing(ctx, dockerclient.InfoTimeout) + + assert.NoError(t, pingResponse.Error) + assert.Equal(t, "test_docker_api", pingResponse.Response.APIVersion) +} + +func TestSystemPingError(t *testing.T) { + mockDockerSDK, client, _, _, _, done := dockerClientSetup(t) + defer done() + + mockDockerSDK.EXPECT().Ping(gomock.Any()).Return(types.Ping{}, errors.New("test error")) + + ctx, cancel := context.WithCancel(context.TODO()) + defer cancel() + pingResponse := client.SystemPing(ctx, dockerclient.InfoTimeout) + + assert.Error(t, pingResponse.Error) + assert.Nil(t, pingResponse.Response) +} + 
func TestDockerInfo(t *testing.T) { mockDockerSDK, client, _, _, _, done := dockerClientSetup(t) defer done() diff --git a/agent/dockerclient/dockerapi/errors.go b/agent/dockerclient/dockerapi/errors.go index 8d8f179feb8..ae4261b5a43 100644 --- a/agent/dockerclient/dockerapi/errors.go +++ b/agent/dockerclient/dockerapi/errors.go @@ -50,6 +50,12 @@ func (err *DockerTimeoutError) Error() string { // ErrorName returns the name of the error func (err *DockerTimeoutError) ErrorName() string { return DockerTimeoutErrorName } +// IsRetriableError returns a boolean indicating whether the call that +// generated the error can be retried. +func (err DockerTimeoutError) IsRetriableError() bool { + return true +} + // OutOfMemoryError is a type for errors caused by running out of memory type OutOfMemoryError struct{} diff --git a/agent/dockerclient/dockerapi/mocks/dockerapi_mocks.go b/agent/dockerclient/dockerapi/mocks/dockerapi_mocks.go index 63ff0bb7ddd..a2abcd7d81f 100644 --- a/agent/dockerclient/dockerapi/mocks/dockerapi_mocks.go +++ b/agent/dockerclient/dockerapi/mocks/dockerapi_mocks.go @@ -431,6 +431,20 @@ func (mr *MockDockerClientMockRecorder) SupportedVersions() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SupportedVersions", reflect.TypeOf((*MockDockerClient)(nil).SupportedVersions)) } +// SystemPing mocks base method +func (m *MockDockerClient) SystemPing(arg0 context.Context, arg1 time.Duration) dockerapi.PingResponse { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SystemPing", arg0, arg1) + ret0, _ := ret[0].(dockerapi.PingResponse) + return ret0 +} + +// SystemPing indicates an expected call of SystemPing +func (mr *MockDockerClientMockRecorder) SystemPing(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SystemPing", reflect.TypeOf((*MockDockerClient)(nil).SystemPing), arg0, arg1) +} + // TopContainer mocks base method func (m *MockDockerClient) TopContainer(arg0 
context.Context, arg1 string, arg2 time.Duration, arg3 ...string) (*container0.ContainerTopOKBody, error) { m.ctrl.T.Helper() diff --git a/agent/dockerclient/dockerapi/types.go b/agent/dockerclient/dockerapi/types.go index b490dab9a52..d368e3a6459 100644 --- a/agent/dockerclient/dockerapi/types.go +++ b/agent/dockerclient/dockerapi/types.go @@ -97,6 +97,11 @@ type ListImagesResponse struct { Error error } +type PingResponse struct { + Response *types.Ping + Error error +} + // VolumeResponse wrapper for CreateVolume and InspectVolume // TODO Remove type when migration is complete type VolumeResponse struct { diff --git a/agent/engine/docker_task_engine.go b/agent/engine/docker_task_engine.go index fd284b88b2d..284bc83552a 100644 --- a/agent/engine/docker_task_engine.go +++ b/agent/engine/docker_task_engine.go @@ -25,6 +25,9 @@ import ( "sync" "time" + "github.com/aws/amazon-ecs-agent/agent/logger" + "github.com/aws/amazon-ecs-agent/agent/logger/field" + "github.com/aws/amazon-ecs-agent/agent/api" apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container" apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status" @@ -102,11 +105,11 @@ const ( defaultMonitorExecAgentsInterval = 15 * time.Minute - stopContainerBackoffMin = time.Second - stopContainerBackoffMax = time.Second * 5 + defaultStopContainerBackoffMin = time.Second + defaultStopContainerBackoffMax = time.Second * 5 stopContainerBackoffJitter = 0.2 stopContainerBackoffMultiplier = 1.3 - stopContainerMaxRetryCount = 3 + stopContainerMaxRetryCount = 5 ) var newExponentialBackoff = retry.NewExponentialBackoff @@ -176,6 +179,8 @@ type DockerTaskEngine struct { monitorExecAgentsTicker *time.Ticker execCmdMgr execcmd.Manager monitorExecAgentsInterval time.Duration + stopContainerBackoffMin time.Duration + stopContainerBackoffMax time.Duration } // NewDockerTaskEngine returns a created, but uninitialized, DockerTaskEngine. 
@@ -214,6 +219,8 @@ func NewDockerTaskEngine(cfg *config.Config, handleDelay: time.Sleep, execCmdMgr: execCmdMgr, monitorExecAgentsInterval: defaultMonitorExecAgentsInterval, + stopContainerBackoffMin: defaultStopContainerBackoffMin, + stopContainerBackoffMax: defaultStopContainerBackoffMax, } dockerTaskEngine.initializeContainerStatusToTransitionFunction() @@ -1552,19 +1559,28 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api // for more information, see: https://github.com/moby/moby/issues/41587 func (engine *DockerTaskEngine) stopDockerContainer(dockerID, containerName string, apiTimeoutStopContainer time.Duration) dockerapi.DockerContainerMetadata { var md dockerapi.DockerContainerMetadata - backoff := newExponentialBackoff(stopContainerBackoffMin, stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier) + backoff := newExponentialBackoff(engine.stopContainerBackoffMin, engine.stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier) for i := 0; i < stopContainerMaxRetryCount; i++ { md = engine.client.StopContainer(engine.ctx, dockerID, apiTimeoutStopContainer) - if md.Error == nil || md.Error.ErrorName() != dockerapi.DockerTimeoutErrorName { + if md.Error == nil { + return md + } + cannotStopContainerError, ok := md.Error.(cannotStopContainerError) + if ok && !cannotStopContainerError.IsRetriableError() { return md } + if i < stopContainerMaxRetryCount-1 { - time.Sleep(backoff.Duration()) + retryIn := backoff.Duration() + logger.Warn(fmt.Sprintf("Error stopping container, retrying in %v", retryIn), logger.Fields{ + field.Container: containerName, + field.RuntimeID: dockerID, + field.Error: md.Error, + "attempt": i + 1, + }) + time.Sleep(retryIn) } } - if md.Error != nil && md.Error.ErrorName() == dockerapi.DockerTimeoutErrorName { - seelog.Warnf("Unable to stop container (%s) after %d attempts that timed out; giving up and marking container as stopped anyways", 
containerName, stopContainerMaxRetryCount) - } return md } diff --git a/agent/engine/docker_task_engine_test.go b/agent/engine/docker_task_engine_test.go index 65daa2820ad..1d7c863e5fa 100644 --- a/agent/engine/docker_task_engine_test.go +++ b/agent/engine/docker_task_engine_test.go @@ -175,7 +175,8 @@ func mocks(t *testing.T, ctx context.Context, cfg *config.Config) (*gomock.Contr imageManager, dockerstate.NewTaskEngineState(), metadataManager, nil, execCmdMgr) taskEngine.(*DockerTaskEngine)._time = mockTime taskEngine.(*DockerTaskEngine).ctx = ctx - + taskEngine.(*DockerTaskEngine).stopContainerBackoffMin = time.Millisecond + taskEngine.(*DockerTaskEngine).stopContainerBackoffMax = time.Millisecond * 2 return ctrl, client, mockTime, taskEngine, credentialsManager, imageManager, metadataManager } @@ -920,7 +921,10 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) { // Validate that timeouts are retried exactly 3 times client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()). Return(containerStopTimeoutError). - Times(3), + Times(5), + + client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}). + Times(1), ) } @@ -991,7 +995,10 @@ func TestTaskTransitionWhenStopContainerReturnsUnretriableError(t *testing.T) { // event. client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(dockerapi.DockerContainerMetadata{ Error: dockerapi.CannotStopContainerError{dockerapi.NoSuchContainerError{}}, - }).MinTimes(1), + }).MaxTimes(1), + client.EXPECT().SystemPing(gomock.Any(), gomock.Any()). + Return(dockerapi.PingResponse{}). 
+ Times(1), ) } @@ -1042,7 +1049,7 @@ func TestTaskTransitionWhenStopContainerReturnsTransientErrorBeforeSucceeding(t client.EXPECT().StartContainer(gomock.Any(), containerID, defaultConfig.ContainerStartTimeout).Return( dockerapi.DockerContainerMetadata{DockerID: containerID}), // StopContainer errors out a couple of times - client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(2), + client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(4), // Since task is not in steady state, progressContainers causes // another invocation of StopContainer. Return the 'succeed' response, // which should cause the task engine to stop invoking this again and diff --git a/agent/engine/task_manager.go b/agent/engine/task_manager.go index c425a38efba..fe764938960 100644 --- a/agent/engine/task_manager.go +++ b/agent/engine/task_manager.go @@ -51,6 +51,7 @@ const ( // credentials from acs, after the timeout it will check the credentials manager // and start processing the task or start another round of waiting waitForPullCredentialsTimeout = 1 * time.Minute + systemPingTimeout = 5 * time.Second defaultTaskSteadyStatePollInterval = 5 * time.Minute defaultTaskSteadyStatePollIntervalJitter = 30 * time.Second transitionPollTime = 5 * time.Second @@ -132,6 +133,7 @@ type managedTask struct { cfg *config.Config credentialsManager credentials.Manager cniClient ecscni.CNIClient + dockerClient dockerapi.DockerClient taskStopWG *utilsync.SequentialWaitGroup acsMessages chan acsTransition @@ -180,6 +182,7 @@ func (engine *DockerTaskEngine) newManagedTask(task *apitask.Task) *managedTask containerChangeEventStream: engine.containerChangeEventStream, credentialsManager: engine.credentialsManager, cniClient: engine.cniClient, + dockerClient: engine.client, taskStopWG: engine.taskStopGroup, steadyStatePollInterval: engine.taskSteadyStatePollInterval, steadyStatePollIntervalJitter: 
engine.taskSteadyStatePollIntervalJitter, @@ -930,16 +933,16 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi.DockerContainerChangeEvent, container *apicontainer.Container, currentKnownStatus apicontainerstatus.ContainerStatus) bool { - // If docker returned a transient error while trying to stop a container, - // reset the known status to the current status and return - cannotStopContainerError, ok := event.Error.(cannotStopContainerError) - if ok && cannotStopContainerError.IsRetriableError() { - logger.Info("Error stopping the container; ignoring state change", logger.Fields{ - field.TaskARN: mtask.Arn, - field.Container: container.Name, - field.RuntimeID: container.GetRuntimeID(), - "ErrorName": event.Error.ErrorName(), - field.Error: cannotStopContainerError.Error(), + + pr := mtask.dockerClient.SystemPing(mtask.ctx, systemPingTimeout) + if pr.Error != nil { + logger.Info("Error stopping the container, but docker seems to be unresponsive; ignoring state change", logger.Fields{ + field.TaskARN: mtask.Arn, + field.Container: container.Name, + field.RuntimeID: container.GetRuntimeID(), + "ErrorName": event.Error.ErrorName(), + field.Error: event.Error.Error(), + "SystemPingError": pr.Error, }) container.SetKnownStatus(currentKnownStatus) return false diff --git a/agent/engine/task_manager_test.go b/agent/engine/task_manager_test.go index 9a6a37779ff..b22958fcf9c 100644 --- a/agent/engine/task_manager_test.go +++ b/agent/engine/task_manager_test.go @@ -84,8 +84,8 @@ func TestHandleEventError(t *testing.T) { FromError: errors.New(""), }, ExpectedContainerKnownStatusSet: true, - ExpectedContainerKnownStatus: apicontainerstatus.ContainerRunning, - ExpectedOK: false, + ExpectedContainerKnownStatus: apicontainerstatus.ContainerStopped, + ExpectedOK: true, }, { Name: "Unretriable error with Stop", @@ -187,6 +187,15 @@ func TestHandleEventError(t *testing.T) 
{ for _, tc := range testCases { t.Run(tc.Name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + client := mock_dockerapi.NewMockDockerClient(ctrl) + + if tc.EventStatus == apicontainerstatus.ContainerStopped { + client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}). + Times(1) + } + container := &apicontainer.Container{ KnownStatusUnsafe: tc.CurrentContainerKnownStatus, } @@ -203,8 +212,9 @@ func TestHandleEventError(t *testing.T) { Task: &apitask.Task{ Arn: "task1", }, - engine: &DockerTaskEngine{}, - cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior}, + engine: &DockerTaskEngine{}, + cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior}, + dockerClient: client, } ok := mtask.handleEventError(containerChange, tc.CurrentContainerKnownStatus) assert.Equal(t, tc.ExpectedOK, ok, "to proceed")