Skip to content

Commit

Permalink
stop container timeout: increase buffer from 30s to 2m
Browse files Browse the repository at this point in the history
  • Loading branch information
sparrc committed Oct 29, 2020
1 parent 9280bcf commit c0dbcd6
Show file tree
Hide file tree
Showing 8 changed files with 22 additions and 18 deletions.
4 changes: 2 additions & 2 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ const (
// This is only used when PollMetrics is set to true
DefaultPollingMetricsWaitDuration = DefaultContainerMetricsPublishInterval / 2

// defaultDockerStopTimeout specifies the value for container stop timeout duration
defaultDockerStopTimeout = 30 * time.Second
// DefaultDockerStopTimeout specifies the value for container stop timeout duration
DefaultDockerStopTimeout = 30 * time.Second

// DefaultImageCleanupTimeInterval specifies the default value for image cleanup duration. It is used to
// remove the images pulled by agent.
Expand Down
4 changes: 2 additions & 2 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,15 +293,15 @@ func TestInvalidFormatDockerStopTimeout(t *testing.T) {
defer setTestEnv("ECS_CONTAINER_STOP_TIMEOUT", "invalid")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, conf.DockerStopTimeout, defaultDockerStopTimeout, "Wrong value for DockerStopTimeout")
assert.Equal(t, conf.DockerStopTimeout, DefaultDockerStopTimeout, "Wrong value for DockerStopTimeout")
}

func TestZeroValueDockerStopTimeout(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_CONTAINER_STOP_TIMEOUT", "0s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, defaultDockerStopTimeout, conf.DockerStopTimeout, "Wrong value for DockerStopTimeout")
assert.Equal(t, DefaultDockerStopTimeout, conf.DockerStopTimeout, "Wrong value for DockerStopTimeout")
}

func TestInvalidValueDockerStopTimeout(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func DefaultConfig() Config {
ReservedMemory: 0,
AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver},
TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration,
DockerStopTimeout: defaultDockerStopTimeout,
DockerStopTimeout: DefaultDockerStopTimeout,
ContainerStartTimeout: defaultContainerStartTimeout,
CredentialsAuditLogFile: defaultCredentialsAuditLogFile,
CredentialsAuditLogDisabled: false,
Expand Down
2 changes: 1 addition & 1 deletion agent/config/config_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ func DefaultConfig() Config {
ReservedMemory: 0,
AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver, dockerclient.AWSLogsDriver},
TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration,
DockerStopTimeout: defaultDockerStopTimeout,
DockerStopTimeout: DefaultDockerStopTimeout,
ContainerStartTimeout: defaultContainerStartTimeout,
ImagePullInactivityTimeout: defaultImagePullInactivityTimeout,
ImagePullTimeout: DefaultImagePullTimeout,
Expand Down
12 changes: 7 additions & 5 deletions agent/dockerclient/dockerapi/docker_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ const (
)

// Timelimits for docker operations enforced above docker
// TODO: Make these limits configurable.
const (
// Parameters for caching the docker auth for ECR
tokenCacheSize = 100
Expand All @@ -90,7 +89,12 @@ const (
pollStatsTimeout = 18 * time.Second
)

var ctxTimeoutStopContainer = dockerclient.StopContainerTimeout
// stopContainerTimeoutBuffer is a buffer added to the timeout passed into the docker
// StopContainer api call. The reason for this buffer is that when the regular "stop"
// command fails, the docker api falls back to other kill methods, such as a containerd
// kill and SIGKILL. This buffer adds time onto the context timeout to allow time
// for these backup kill methods to finish.
var stopContainerTimeoutBuffer = 2 * time.Minute

type inactivityTimeoutHandlerFunc func(reader io.ReadCloser, timeout time.Duration, cancelRequest func(), canceled *uint32) (io.ReadCloser, chan<- struct{})

Expand Down Expand Up @@ -659,9 +663,7 @@ func (dg *dockerGoClient) inspectContainer(ctx context.Context, dockerID string)
}

func (dg *dockerGoClient) StopContainer(ctx context.Context, dockerID string, timeout time.Duration) DockerContainerMetadata {
// ctxTimeout is sum of timeout(applied to the StopContainer api call) and a fixed constant dockerclient.StopContainerTimeout
// the context's timeout should be greater than the sigkill timout for the StopContainer call
ctxTimeout := timeout + ctxTimeoutStopContainer
ctxTimeout := timeout + stopContainerTimeoutBuffer
ctx, cancel := context.WithTimeout(ctx, ctxTimeout)
defer cancel()
defer metrics.MetricsEngineGlobal.RecordDockerMetric("STOP_CONTAINER")()
Expand Down
10 changes: 7 additions & 3 deletions agent/dockerclient/dockerapi/docker_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,11 @@ func TestStopContainerTimeout(t *testing.T) {
cfg.DockerStopTimeout = xContainerShortTimeout
mockDockerSDK, client, _, _, _, done := dockerClientSetupWithConfig(t, cfg)
defer done()
ctxTimeoutStopContainer = xContainerShortTimeout
reset := stopContainerTimeoutBuffer
stopContainerTimeoutBuffer = xContainerShortTimeout
defer func() {
stopContainerTimeoutBuffer = reset
}()

wait := &sync.WaitGroup{}
wait.Add(1)
Expand All @@ -488,7 +492,7 @@ func TestStopContainerTimeout(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
metadata := client.StopContainer(ctx, "id", xContainerShortTimeout)
assert.Error(t, metadata.Error, "Expected error for pull timeout")
assert.Error(t, metadata.Error, "Expected error for stop timeout")
assert.Equal(t, "DockerTimeoutError", metadata.Error.(apierrors.NamedError).ErrorName())
wait.Done()
}
Expand All @@ -514,7 +518,7 @@ func TestStopContainer(t *testing.T) {
)
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()
metadata := client.StopContainer(ctx, "id", dockerclient.StopContainerTimeout)
metadata := client.StopContainer(ctx, "id", client.config.DockerStopTimeout)
assert.NoError(t, metadata.Error)
assert.Equal(t, "id", metadata.DockerID)
}
Expand Down
2 changes: 0 additions & 2 deletions agent/dockerclient/timeout.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ const (
ListContainersTimeout = 10 * time.Minute
// InspectContainerTimeout is the timeout for the InspectContainer API.
InspectContainerTimeout = 30 * time.Second
// StopContainerTimeout is the timeout for the StopContainer API.
StopContainerTimeout = 30 * time.Second
// RemoveContainerTimeout is the timeout for the RemoveContainer API.
RemoveContainerTimeout = 5 * time.Minute

Expand Down
4 changes: 2 additions & 2 deletions agent/engine/docker_task_engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ func TestSteadyStatePoll(t *testing.T) {
dockerapi.DockerContainerMetadata{
DockerID: containerID,
}).AnyTimes()
client.EXPECT().StopContainer(gomock.Any(), containerID, dockerclient.StopContainerTimeout).AnyTimes()
client.EXPECT().StopContainer(gomock.Any(), containerID, config.DefaultDockerStopTimeout).AnyTimes()

err := taskEngine.Init(ctx) // start the task engine
assert.NoError(t, err)
Expand Down Expand Up @@ -869,7 +869,7 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) {
containerStopTimeoutError := dockerapi.DockerContainerMetadata{
Error: &dockerapi.DockerTimeoutError{
Transition: "stop",
Duration: dockerclient.StopContainerTimeout,
Duration: config.DefaultDockerStopTimeout,
},
}
dockerEventSent := make(chan int)
Expand Down

0 comments on commit c0dbcd6

Please sign in to comment.