Commit

Retry StopContainer for a limited number of times

angelcar committed May 27, 2021
1 parent d30da28 commit 952f401
Showing 5 changed files with 69 additions and 27 deletions.
6 changes: 6 additions & 0 deletions agent/dockerclient/dockerapi/errors.go
@@ -50,6 +50,12 @@ func (err *DockerTimeoutError) Error() string {
// ErrorName returns the name of the error
func (err *DockerTimeoutError) ErrorName() string { return DockerTimeoutErrorName }

// IsRetriableError returns a boolean indicating whether the call that
// generated the error can be retried.
func (err DockerTimeoutError) IsRetriableError() bool {
return true
}

// OutOfMemoryError is a type for errors caused by running out of memory
type OutOfMemoryError struct{}

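Later in this commit the engine's retry loop decides whether to keep trying by type-asserting the stop error against a type that exposes IsRetriableError(). A minimal, self-contained sketch of that pattern; the interface and error types here are illustrative stand-ins, not the agent's actual ones:

package main

import (
	"errors"
	"fmt"
)

// retriable is a stand-in for the behavior DockerTimeoutError now exposes.
type retriable interface {
	IsRetriableError() bool
}

type timeoutError struct{ op string }

func (e timeoutError) Error() string          { return e.op + " timed out" }
func (e timeoutError) IsRetriableError() bool { return true }

// shouldRetry reports whether err says the failed call is worth retrying.
func shouldRetry(err error) bool {
	var r retriable
	if errors.As(err, &r) {
		return r.IsRetriableError()
	}
	return false
}

func main() {
	fmt.Println(shouldRetry(timeoutError{op: "StopContainer"})) // true
	fmt.Println(shouldRetry(errors.New("no such container")))   // false
}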
34 changes: 25 additions & 9 deletions agent/engine/docker_task_engine.go
@@ -25,6 +25,9 @@ import (
"sync"
"time"

"github.com/aws/amazon-ecs-agent/agent/logger"
"github.com/aws/amazon-ecs-agent/agent/logger/field"

"github.com/aws/amazon-ecs-agent/agent/api"
apicontainer "github.com/aws/amazon-ecs-agent/agent/api/container"
apicontainerstatus "github.com/aws/amazon-ecs-agent/agent/api/container/status"
@@ -102,11 +105,11 @@ const (

defaultMonitorExecAgentsInterval = 15 * time.Minute

stopContainerBackoffMin = time.Second
stopContainerBackoffMax = time.Second * 5
defaultStopContainerBackoffMin = time.Second
defaultStopContainerBackoffMax = time.Second * 5
stopContainerBackoffJitter = 0.2
stopContainerBackoffMultiplier = 1.3
stopContainerMaxRetryCount = 3
stopContainerMaxRetryCount = 5
)

var newExponentialBackoff = retry.NewExponentialBackoff
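The constant changes above mean a stop that keeps failing is now attempted 5 times instead of 3, with jittered exponential backoff between attempts. Ignoring the ±20% jitter, the sleeps grow roughly as 1s, 1.3s, 1.69s and 2.2s, so the agent waits about six extra seconds in the worst case before giving up. A standalone, simplified sketch of that schedule (the agent itself uses its retry.NewExponentialBackoff helper, wired in through the newExponentialBackoff variable above):

package main

import (
	"fmt"
	"time"
)

func main() {
	backoffMin, backoffMax := time.Second, 5*time.Second // defaultStopContainerBackoff{Min,Max}
	multiplier, attempts := 1.3, 5                        // stopContainerBackoffMultiplier, stopContainerMaxRetryCount
	delay, total := backoffMin, time.Duration(0)
	for i := 1; i < attempts; i++ { // one sleep between each pair of attempts
		fmt.Printf("sleep before attempt %d: %v\n", i+1, delay)
		total += delay
		if delay = time.Duration(float64(delay) * multiplier); delay > backoffMax {
			delay = backoffMax
		}
	}
	fmt.Println("worst-case added wait:", total) // about 6.2s, up from ~2.3s with 3 attempts
}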
@@ -176,6 +179,8 @@ type DockerTaskEngine struct {
monitorExecAgentsTicker *time.Ticker
execCmdMgr execcmd.Manager
monitorExecAgentsInterval time.Duration
stopContainerBackoffMin time.Duration
stopContainerBackoffMax time.Duration
}

// NewDockerTaskEngine returns a created, but uninitialized, DockerTaskEngine.
@@ -214,6 +219,8 @@ func NewDockerTaskEngine(cfg *config.Config,
handleDelay: time.Sleep,
execCmdMgr: execCmdMgr,
monitorExecAgentsInterval: defaultMonitorExecAgentsInterval,
stopContainerBackoffMin: defaultStopContainerBackoffMin,
stopContainerBackoffMax: defaultStopContainerBackoffMax,
}

dockerTaskEngine.initializeContainerStatusToTransitionFunction()
@@ -1552,19 +1559,28 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api
// for more information, see: https://github.com/moby/moby/issues/41587
func (engine *DockerTaskEngine) stopDockerContainer(dockerID, containerName string, apiTimeoutStopContainer time.Duration) dockerapi.DockerContainerMetadata {
var md dockerapi.DockerContainerMetadata
backoff := newExponentialBackoff(stopContainerBackoffMin, stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier)
backoff := newExponentialBackoff(engine.stopContainerBackoffMin, engine.stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier)
for i := 0; i < stopContainerMaxRetryCount; i++ {
md = engine.client.StopContainer(engine.ctx, dockerID, apiTimeoutStopContainer)
if md.Error == nil || md.Error.ErrorName() != dockerapi.DockerTimeoutErrorName {
if md.Error == nil {
return md
}
cannotStopContainerError, ok := md.Error.(cannotStopContainerError)
if ok && !cannotStopContainerError.IsRetriableError() {
return md
}

if i < stopContainerMaxRetryCount-1 {
time.Sleep(backoff.Duration())
retryIn := backoff.Duration()
logger.Warn(fmt.Sprintf("Error stopping container, retrying in %v", retryIn), logger.Fields{
field.Container: containerName,
field.RuntimeID: dockerID,
field.Error: md.Error,
"attempt": i + 1,
})
time.Sleep(retryIn)
}
}
if md.Error != nil && md.Error.ErrorName() == dockerapi.DockerTimeoutErrorName {
seelog.Warnf("Unable to stop container (%s) after %d attempts that timed out; giving up and marking container as stopped anyways", containerName, stopContainerMaxRetryCount)
}
return md
}

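The reworked stopDockerContainer boils down to a bounded retry: return immediately on success or on an error the agent knows is not retriable, otherwise back off and try again, and after the final attempt hand back whatever metadata and error the last call produced. A condensed, self-contained sketch of that control flow, with placeholder functions standing in for the agent's Docker client and backoff helper:

package main

import (
	"errors"
	"fmt"
	"time"
)

const maxAttempts = 5

var errNoSuchContainer = errors.New("no such container") // not worth retrying

// stopWithRetries keeps calling stop until it succeeds, fails permanently,
// or the attempt budget is exhausted; the last error is returned either way.
func stopWithRetries(stop func() error, backoff func(attempt int) time.Duration) error {
	var err error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		if err = stop(); err == nil || errors.Is(err, errNoSuchContainer) {
			return err
		}
		if attempt < maxAttempts {
			wait := backoff(attempt)
			fmt.Printf("stop attempt %d failed (%v); retrying in %v\n", attempt, err, wait)
			time.Sleep(wait)
		}
	}
	// Out of attempts: give up and let the caller decide what to do with err
	// (the engine marks the container stopped anyway on repeated timeouts).
	return err
}

func main() {
	calls := 0
	stop := func() error {
		calls++
		if calls < 3 {
			return errors.New("request timed out")
		}
		return nil // third attempt succeeds
	}
	err := stopWithRetries(stop, func(int) time.Duration { return 10 * time.Millisecond })
	fmt.Println("calls:", calls, "err:", err) // calls: 3 err: <nil>
}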
15 changes: 11 additions & 4 deletions agent/engine/docker_task_engine_test.go
@@ -175,7 +175,8 @@ func mocks(t *testing.T, ctx context.Context, cfg *config.Config) (*gomock.Contr
imageManager, dockerstate.NewTaskEngineState(), metadataManager, nil, execCmdMgr)
taskEngine.(*DockerTaskEngine)._time = mockTime
taskEngine.(*DockerTaskEngine).ctx = ctx

taskEngine.(*DockerTaskEngine).stopContainerBackoffMin = time.Millisecond
taskEngine.(*DockerTaskEngine).stopContainerBackoffMax = time.Millisecond * 2
return ctrl, client, mockTime, taskEngine, credentialsManager, imageManager, metadataManager
}

@@ -920,7 +921,10 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) {
// Validate that timeouts are retried exactly 5 times
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).
Return(containerStopTimeoutError).
Times(3),
Times(5),

client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}).
Times(1),
)
}

@@ -991,7 +995,10 @@ func TestTaskTransitionWhenStopContainerReturnsUnretriableError(t *testing.T) {
// event.
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(dockerapi.DockerContainerMetadata{
Error: dockerapi.CannotStopContainerError{dockerapi.NoSuchContainerError{}},
}).MinTimes(1),
}).MaxTimes(1),
client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).
Return(dockerapi.PingResponse{}).
Times(1),
)
}

@@ -1042,7 +1049,7 @@ func TestTaskTransitionWhenStopContainerReturnsTransientErrorBeforeSucceeding(t
client.EXPECT().StartContainer(gomock.Any(), containerID, defaultConfig.ContainerStartTimeout).Return(
dockerapi.DockerContainerMetadata{DockerID: containerID}),
// StopContainer errors out a couple of times
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(2),
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(4),
// Since task is not in steady state, progressContainers causes
// another invocation of StopContainer. Return the 'succeed' response,
// which should cause the task engine to stop invoking this again and
23 changes: 13 additions & 10 deletions agent/engine/task_manager.go
@@ -51,6 +51,7 @@ const (
// credentials from acs, after the timeout it will check the credentials manager
// and start processing the task or start another round of waiting
waitForPullCredentialsTimeout = 1 * time.Minute
systemPingTimeout = 5 * time.Second
defaultTaskSteadyStatePollInterval = 5 * time.Minute
defaultTaskSteadyStatePollIntervalJitter = 30 * time.Second
transitionPollTime = 5 * time.Second
@@ -132,6 +133,7 @@ type managedTask struct {
cfg *config.Config
credentialsManager credentials.Manager
cniClient ecscni.CNIClient
dockerClient dockerapi.DockerClient
taskStopWG *utilsync.SequentialWaitGroup

acsMessages chan acsTransition
@@ -180,6 +182,7 @@ func (engine *DockerTaskEngine) newManagedTask(task *apitask.Task) *managedTask {
containerChangeEventStream: engine.containerChangeEventStream,
credentialsManager: engine.credentialsManager,
cniClient: engine.cniClient,
dockerClient: engine.client,
taskStopWG: engine.taskStopGroup,
steadyStatePollInterval: engine.taskSteadyStatePollInterval,
steadyStatePollIntervalJitter: engine.taskSteadyStatePollIntervalJitter,
@@ -930,16 +933,16 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange
func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi.DockerContainerChangeEvent,
container *apicontainer.Container,
currentKnownStatus apicontainerstatus.ContainerStatus) bool {
// If docker returned a transient error while trying to stop a container,
// reset the known status to the current status and return
cannotStopContainerError, ok := event.Error.(cannotStopContainerError)
if ok && cannotStopContainerError.IsRetriableError() {
logger.Info("Error stopping the container; ignoring state change", logger.Fields{
field.TaskARN: mtask.Arn,
field.Container: container.Name,
field.RuntimeID: container.GetRuntimeID(),
"ErrorName": event.Error.ErrorName(),
field.Error: cannotStopContainerError.Error(),

pr := mtask.dockerClient.SystemPing(mtask.ctx, systemPingTimeout)
if pr.Error != nil {
logger.Info("Error stopping the container, but docker seems to be unresponsive; ignoring state change", logger.Fields{
field.TaskARN: mtask.Arn,
field.Container: container.Name,
field.RuntimeID: container.GetRuntimeID(),
"ErrorName": event.Error.ErrorName(),
field.Error: event.Error.Error(),
"SystemPingError": pr.Error,
})
container.SetKnownStatus(currentKnownStatus)
return false
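The behavioral change here: a failed stop no longer keeps the container in its previous known status just because the error looked retriable; instead the agent pings the Docker daemon, and an unresponsive daemon is what causes the state change to be ignored. What happens after a successful ping is outside the hunk shown; the sketch below assumes the STOPPED transition then proceeds. A self-contained sketch of that decision, with a hand-rolled pinger interface standing in for the agent's DockerClient.SystemPing:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// pinger stands in for the subset of the agent's Docker client used here.
type pinger interface {
	Ping(ctx context.Context) error
}

// proceedWithStopped reports whether the STOPPED transition should go ahead
// even though stopping the container returned stopErr. An unresponsive
// daemon means the failure is likely transient, so the transition is held back.
func proceedWithStopped(ctx context.Context, docker pinger, stopErr error) bool {
	pingCtx, cancel := context.WithTimeout(ctx, 5*time.Second) // systemPingTimeout
	defer cancel()
	if pingErr := docker.Ping(pingCtx); pingErr != nil {
		fmt.Printf("stop failed (%v) and docker is unresponsive (%v); ignoring state change\n", stopErr, pingErr)
		return false
	}
	// Daemon answered: accept the STOPPED state despite the stop error.
	return true
}

type fakeDocker struct{ err error }

func (f fakeDocker) Ping(ctx context.Context) error { return f.err }

func main() {
	stopErr := errors.New("cannot stop container: request timed out")
	fmt.Println(proceedWithStopped(context.Background(), fakeDocker{}, stopErr))                               // true
	fmt.Println(proceedWithStopped(context.Background(), fakeDocker{err: errors.New("daemon down")}, stopErr)) // false
}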
18 changes: 14 additions & 4 deletions agent/engine/task_manager_test.go
@@ -84,8 +84,8 @@ func TestHandleEventError(t *testing.T) {
FromError: errors.New(""),
},
ExpectedContainerKnownStatusSet: true,
ExpectedContainerKnownStatus: apicontainerstatus.ContainerRunning,
ExpectedOK: false,
ExpectedContainerKnownStatus: apicontainerstatus.ContainerStopped,
ExpectedOK: true,
},
{
Name: "Unretriable error with Stop",
@@ -187,6 +187,15 @@ func TestHandleEventError(t *testing.T) {

for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()
client := mock_dockerapi.NewMockDockerClient(ctrl)

if tc.EventStatus == apicontainerstatus.ContainerStopped {
client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}).
Times(1)
}

container := &apicontainer.Container{
KnownStatusUnsafe: tc.CurrentContainerKnownStatus,
}
@@ -203,8 +212,9 @@ func TestHandleEventError(t *testing.T) {
Task: &apitask.Task{
Arn: "task1",
},
engine: &DockerTaskEngine{},
cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior},
engine: &DockerTaskEngine{},
cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior},
dockerClient: client,
}
ok := mtask.handleEventError(containerChange, tc.CurrentContainerKnownStatus)
assert.Equal(t, tc.ExpectedOK, ok, "to proceed")
