Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop container bug fix #2885

Merged
merged 3 commits into from
Jun 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion agent/dockerclient/dockerapi/docker_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ type DockerClient interface {
// should be provided for the request.
ListContainers(context.Context, bool, time.Duration) ListContainersResponse

// SystemPing returns the Ping response from Docker's SystemPing API
SystemPing(context.Context, time.Duration) PingResponse

// ListImages returns the set of the images known to the Docker daemon
ListImages(context.Context, time.Duration) ListImagesResponse

Expand Down Expand Up @@ -750,7 +753,11 @@ func (dg *dockerGoClient) stopContainer(ctx context.Context, dockerID string, ti
metadata := dg.containerMetadata(ctx, dockerID)
if err != nil {
seelog.Errorf("DockerGoClient: error stopping container ID=%s: %v", dockerID, err)
if metadata.Error == nil {
if metadata.Error != nil {
// Wrap metadata.Error in CannotStopContainerError in order to make the whole stopContainer operation
// retryable.
metadata.Error = CannotStopContainerError{metadata.Error}
} else {
if strings.Contains(err.Error(), "No such container") {
err = NoSuchContainerError{dockerID}
}
Expand Down Expand Up @@ -1099,6 +1106,42 @@ func (dg *dockerGoClient) listImages(ctx context.Context) ListImagesResponse {
return ListImagesResponse{ImageIDs: imageIDs, RepoTags: imageRepoTags, Error: nil}
}

func (dg *dockerGoClient) SystemPing(ctx context.Context, timeout time.Duration) PingResponse {
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()

// Buffered channel so in the case of timeout it takes one write, never gets
// read, and can still be GC'd
response := make(chan PingResponse, 1)
go func() { response <- dg.systemPing(ctx) }()
select {
case resp := <-response:
return resp
case <-ctx.Done():
// Context has either expired or canceled. If it has timed out,
// send back the DockerTimeoutError
err := ctx.Err()
if err == context.DeadlineExceeded {
return PingResponse{Error: &DockerTimeoutError{timeout, "listing"}}
}
return PingResponse{Error: err}
}
}

func (dg *dockerGoClient) systemPing(ctx context.Context) PingResponse {
client, err := dg.sdkDockerClient()
if err != nil {
return PingResponse{Error: err}
}

pingResponse, err := client.Ping(ctx)
if err != nil {
return PingResponse{Error: err}
}

return PingResponse{Response: &pingResponse}
}

func (dg *dockerGoClient) SupportedVersions() []dockerclient.DockerVersion {
return dg.sdkClientFactory.FindSupportedAPIVersions()
}
Expand Down
28 changes: 28 additions & 0 deletions agent/dockerclient/dockerapi/docker_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1008,6 +1008,34 @@ func TestDockerVersion(t *testing.T) {
assert.Equal(t, "1.6.0", str, "Got unexpected version string: "+str)
}

func TestSystemPing(t *testing.T) {
mockDockerSDK, client, _, _, _, done := dockerClientSetup(t)
defer done()

mockDockerSDK.EXPECT().Ping(gomock.Any()).Return(types.Ping{APIVersion: "test_docker_api"}, nil)

ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
pingResponse := client.SystemPing(ctx, dockerclient.InfoTimeout)

assert.NoError(t, pingResponse.Error)
assert.Equal(t, "test_docker_api", pingResponse.Response.APIVersion)
}

func TestSystemPingError(t *testing.T) {
mockDockerSDK, client, _, _, _, done := dockerClientSetup(t)
defer done()

mockDockerSDK.EXPECT().Ping(gomock.Any()).Return(types.Ping{}, errors.New("test error"))

ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
pingResponse := client.SystemPing(ctx, dockerclient.InfoTimeout)

assert.Error(t, pingResponse.Error)
assert.Nil(t, pingResponse.Response)
}

func TestDockerInfo(t *testing.T) {
mockDockerSDK, client, _, _, _, done := dockerClientSetup(t)
defer done()
Expand Down
6 changes: 6 additions & 0 deletions agent/dockerclient/dockerapi/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ func (err *DockerTimeoutError) Error() string {
// ErrorName returns the name of the error
func (err *DockerTimeoutError) ErrorName() string { return DockerTimeoutErrorName }

// IsRetriableError returns a boolean indicating whether the call that
// generated the error can be retried.
func (err DockerTimeoutError) IsRetriableError() bool {
return true
}

// OutOfMemoryError is a type for errors caused by running out of memory
type OutOfMemoryError struct{}

Expand Down
14 changes: 14 additions & 0 deletions agent/dockerclient/dockerapi/mocks/dockerapi_mocks.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions agent/dockerclient/dockerapi/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ type ListImagesResponse struct {
Error error
}

type PingResponse struct {
Response *types.Ping
Error error
}

// VolumeResponse wrapper for CreateVolume and InspectVolume
// TODO Remove type when migration is complete
type VolumeResponse struct {
Expand Down
34 changes: 25 additions & 9 deletions agent/engine/docker_task_engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ import (
"sync"
"time"

"github.com/aws/amazon-ecs-agent/agent/logger"
"github.com/aws/amazon-ecs-agent/agent/logger/field"

"github.com/aws/amazon-ecs-agent/agent/api"
apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container"
apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status"
Expand Down Expand Up @@ -102,11 +105,11 @@ const (

defaultMonitorExecAgentsInterval = 15 * time.Minute

stopContainerBackoffMin = time.Second
stopContainerBackoffMax = time.Second * 5
defaultStopContainerBackoffMin = time.Second
defaultStopContainerBackoffMax = time.Second * 5
stopContainerBackoffJitter = 0.2
stopContainerBackoffMultiplier = 1.3
stopContainerMaxRetryCount = 3
stopContainerMaxRetryCount = 5
)

var newExponentialBackoff = retry.NewExponentialBackoff
Expand Down Expand Up @@ -176,6 +179,8 @@ type DockerTaskEngine struct {
monitorExecAgentsTicker *time.Ticker
execCmdMgr execcmd.Manager
monitorExecAgentsInterval time.Duration
stopContainerBackoffMin time.Duration
stopContainerBackoffMax time.Duration
}

// NewDockerTaskEngine returns a created, but uninitialized, DockerTaskEngine.
Expand Down Expand Up @@ -214,6 +219,8 @@ func NewDockerTaskEngine(cfg *config.Config,
handleDelay: time.Sleep,
execCmdMgr: execCmdMgr,
monitorExecAgentsInterval: defaultMonitorExecAgentsInterval,
stopContainerBackoffMin: defaultStopContainerBackoffMin,
stopContainerBackoffMax: defaultStopContainerBackoffMax,
}

dockerTaskEngine.initializeContainerStatusToTransitionFunction()
Expand Down Expand Up @@ -1552,19 +1559,28 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api
// for more information, see: https://github.com/moby/moby/issues/41587
func (engine *DockerTaskEngine) stopDockerContainer(dockerID, containerName string, apiTimeoutStopContainer time.Duration) dockerapi.DockerContainerMetadata {
var md dockerapi.DockerContainerMetadata
backoff := newExponentialBackoff(stopContainerBackoffMin, stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier)
backoff := newExponentialBackoff(engine.stopContainerBackoffMin, engine.stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier)
for i := 0; i < stopContainerMaxRetryCount; i++ {
md = engine.client.StopContainer(engine.ctx, dockerID, apiTimeoutStopContainer)
if md.Error == nil || md.Error.ErrorName() != dockerapi.DockerTimeoutErrorName {
if md.Error == nil {
return md
}
cannotStopContainerError, ok := md.Error.(cannotStopContainerError)
if ok && !cannotStopContainerError.IsRetriableError() {
return md
}

if i < stopContainerMaxRetryCount-1 {
time.Sleep(backoff.Duration())
retryIn := backoff.Duration()
logger.Warn(fmt.Sprintf("Error stopping container, retrying in %v", retryIn), logger.Fields{
field.Container: containerName,
field.RuntimeID: dockerID,
field.Error: md.Error,
"attempt": i + 1,
})
time.Sleep(retryIn)
}
}
if md.Error != nil && md.Error.ErrorName() == dockerapi.DockerTimeoutErrorName {
seelog.Warnf("Unable to stop container (%s) after %d attempts that timed out; giving up and marking container as stopped anyways", containerName, stopContainerMaxRetryCount)
}
return md
}

Expand Down
15 changes: 11 additions & 4 deletions agent/engine/docker_task_engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ func mocks(t *testing.T, ctx context.Context, cfg *config.Config) (*gomock.Contr
imageManager, dockerstate.NewTaskEngineState(), metadataManager, nil, execCmdMgr)
taskEngine.(*DockerTaskEngine)._time = mockTime
taskEngine.(*DockerTaskEngine).ctx = ctx

taskEngine.(*DockerTaskEngine).stopContainerBackoffMin = time.Millisecond
taskEngine.(*DockerTaskEngine).stopContainerBackoffMax = time.Millisecond * 2
return ctrl, client, mockTime, taskEngine, credentialsManager, imageManager, metadataManager
}

Expand Down Expand Up @@ -920,7 +921,10 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) {
// Validate that timeouts are retried exactly 3 times
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).
Return(containerStopTimeoutError).
Times(3),
Times(5),

client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}).
Times(1),
)
}

Expand Down Expand Up @@ -991,7 +995,10 @@ func TestTaskTransitionWhenStopContainerReturnsUnretriableError(t *testing.T) {
// event.
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(dockerapi.DockerContainerMetadata{
Error: dockerapi.CannotStopContainerError{dockerapi.NoSuchContainerError{}},
}).MinTimes(1),
}).MaxTimes(1),
client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).
Return(dockerapi.PingResponse{}).
Times(1),
)
}

Expand Down Expand Up @@ -1042,7 +1049,7 @@ func TestTaskTransitionWhenStopContainerReturnsTransientErrorBeforeSucceeding(t
client.EXPECT().StartContainer(gomock.Any(), containerID, defaultConfig.ContainerStartTimeout).Return(
dockerapi.DockerContainerMetadata{DockerID: containerID}),
// StopContainer errors out a couple of times
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(2),
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(4),
// Since task is not in steady state, progressContainers causes
// another invocation of StopContainer. Return the 'succeed' response,
// which should cause the task engine to stop invoking this again and
Expand Down
23 changes: 13 additions & 10 deletions agent/engine/task_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ const (
// credentials from acs, after the timeout it will check the credentials manager
// and start processing the task or start another round of waiting
waitForPullCredentialsTimeout = 1 * time.Minute
systemPingTimeout = 5 * time.Second
defaultTaskSteadyStatePollInterval = 5 * time.Minute
defaultTaskSteadyStatePollIntervalJitter = 30 * time.Second
transitionPollTime = 5 * time.Second
Expand Down Expand Up @@ -132,6 +133,7 @@ type managedTask struct {
cfg *config.Config
credentialsManager credentials.Manager
cniClient ecscni.CNIClient
dockerClient dockerapi.DockerClient
taskStopWG *utilsync.SequentialWaitGroup

acsMessages chan acsTransition
Expand Down Expand Up @@ -180,6 +182,7 @@ func (engine *DockerTaskEngine) newManagedTask(task *apitask.Task) *managedTask
containerChangeEventStream: engine.containerChangeEventStream,
credentialsManager: engine.credentialsManager,
cniClient: engine.cniClient,
dockerClient: engine.client,
taskStopWG: engine.taskStopGroup,
steadyStatePollInterval: engine.taskSteadyStatePollInterval,
steadyStatePollIntervalJitter: engine.taskSteadyStatePollIntervalJitter,
Expand Down Expand Up @@ -930,16 +933,16 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange
func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi.DockerContainerChangeEvent,
container *apicontainer.Container,
currentKnownStatus apicontainerstatus.ContainerStatus) bool {
// If docker returned a transient error while trying to stop a container,
// reset the known status to the current status and return
cannotStopContainerError, ok := event.Error.(cannotStopContainerError)
if ok && cannotStopContainerError.IsRetriableError() {
logger.Info("Error stopping the container; ignoring state change", logger.Fields{
field.TaskARN: mtask.Arn,
field.Container: container.Name,
field.RuntimeID: container.GetRuntimeID(),
"ErrorName": event.Error.ErrorName(),
field.Error: cannotStopContainerError.Error(),

pr := mtask.dockerClient.SystemPing(mtask.ctx, systemPingTimeout)
if pr.Error != nil {
logger.Info("Error stopping the container, but docker seems to be unresponsive; ignoring state change", logger.Fields{
field.TaskARN: mtask.Arn,
field.Container: container.Name,
field.RuntimeID: container.GetRuntimeID(),
"ErrorName": event.Error.ErrorName(),
field.Error: event.Error.Error(),
"SystemPingError": pr.Error,
})
container.SetKnownStatus(currentKnownStatus)
return false
Expand Down
18 changes: 14 additions & 4 deletions agent/engine/task_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ func TestHandleEventError(t *testing.T) {
FromError: errors.New(""),
},
ExpectedContainerKnownStatusSet: true,
ExpectedContainerKnownStatus: apicontainerstatus.ContainerRunning,
ExpectedOK: false,
ExpectedContainerKnownStatus: apicontainerstatus.ContainerStopped,
ExpectedOK: true,
},
{
Name: "Unretriable error with Stop",
Expand Down Expand Up @@ -187,6 +187,15 @@ func TestHandleEventError(t *testing.T) {

for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()
client := mock_dockerapi.NewMockDockerClient(ctrl)

if tc.EventStatus == apicontainerstatus.ContainerStopped {
client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}).
Times(1)
}

container := &apicontainer.Container{
KnownStatusUnsafe: tc.CurrentContainerKnownStatus,
}
Expand All @@ -203,8 +212,9 @@ func TestHandleEventError(t *testing.T) {
Task: &apitask.Task{
Arn: "task1",
},
engine: &DockerTaskEngine{},
cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior},
engine: &DockerTaskEngine{},
cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior},
dockerClient: client,
}
ok := mtask.handleEventError(containerChange, tc.CurrentContainerKnownStatus)
assert.Equal(t, tc.ExpectedOK, ok, "to proceed")
Expand Down