diff --git a/agent/acs/model/api/api-2.json b/agent/acs/model/api/api-2.json index 4e99a8fdddc..0c96d646e18 100644 --- a/agent/acs/model/api/api-2.json +++ b/agent/acs/model/api/api-2.json @@ -204,7 +204,9 @@ "registryAuthentication":{"shape":"RegistryAuthenticationData"}, "logsAuthStrategy":{"shape":"AuthStrategy"}, "secrets":{"shape":"SecretList"}, - "dependsOn":{"shape": "DependsOnList"} + "dependsOn":{"shape": "DependsOnList"}, + "startTimeout":{"shape":"Integer"}, + "stopTimeout":{"shape":"Integer"} } }, "ContainerList":{ @@ -613,4 +615,4 @@ ] } } -} \ No newline at end of file +} diff --git a/agent/acs/model/ecsacs/api.go b/agent/acs/model/ecsacs/api.go index 2c5f3f98693..4ae7360d5da 100644 --- a/agent/acs/model/ecsacs/api.go +++ b/agent/acs/model/ecsacs/api.go @@ -238,6 +238,10 @@ type Container struct { Secrets []*Secret `locationName:"secrets" type:"list"` + StartTimeout *int64 `locationName:"startTimeout" type:"integer"` + + StopTimeout *int64 `locationName:"stopTimeout" type:"integer"` + VolumesFrom []*VolumeFrom `locationName:"volumesFrom" type:"list"` } diff --git a/agent/api/container/container.go b/agent/api/container/container.go index 25e450df245..50f19488372 100644 --- a/agent/api/container/container.go +++ b/agent/api/container/container.go @@ -140,6 +140,12 @@ type Container struct { // LogsAuthStrategy specifies how the logs driver for the container will be // authenticated LogsAuthStrategy string + // StartTimeout specifies the time the agent waits for StartContainer api call + // before timing out + StartTimeout uint + // StopTimeout specifies the time value to be passed as StopContainer api call + StopTimeout uint + // lock is used for fields that are accessed and updated concurrently lock sync.RWMutex @@ -867,3 +873,17 @@ func (c *Container) HasSecretAsEnv() bool { } return false } + +func (c *Container) GetStartTimeout() time.Duration { + c.lock.Lock() + defer c.lock.Unlock() + + return time.Duration(c.StartTimeout) * time.Second +} + +func (c *Container) GetStopTimeout() time.Duration { + c.lock.Lock() + defer c.lock.Unlock() + + return time.Duration(c.StopTimeout) * time.Second +} diff --git a/agent/api/container/container_test.go b/agent/api/container/container_test.go index 921e09ebe19..abd93c1c116 100644 --- a/agent/api/container/container_test.go +++ b/agent/api/container/container_test.go @@ -327,7 +327,7 @@ func TestMergeEnvironmentVariables(t *testing.T) { }, { - Name: "merge single item to nil container env var map", + Name: "merge single item to nil container env var map", InContainerEnvironment: nil, InEnvVarMap: map[string]string{ "SECRET1": "secret1"}, @@ -357,7 +357,7 @@ func TestMergeEnvironmentVariables(t *testing.T) { }, { - Name: "merge nil to nil container env var map", + Name: "merge nil to nil container env var map", InContainerEnvironment: nil, InEnvVarMap: nil, OutEnvVarMap: map[string]string{}, @@ -455,3 +455,16 @@ func TestHasSecretAsEnv(t *testing.T) { } } + +func TestPerContainerTimeouts(t *testing.T) { + timeout := uint(10) + expectedTimeout := time.Duration(timeout) * time.Second + + container := Container{ + StartTimeout: timeout, + StopTimeout: timeout, + } + + assert.Equal(t, container.GetStartTimeout(), expectedTimeout) + assert.Equal(t, container.GetStopTimeout(), expectedTimeout) +} diff --git a/agent/api/task/task_test.go b/agent/api/task/task_test.go index 491fa50218a..6cfb5c9dd4f 100644 --- a/agent/api/task/task_test.go +++ b/agent/api/task/task_test.go @@ -2845,28 +2845,27 @@ func TestTaskGPUDisabled(t *testing.T) { }, }, } - assert.False(t, testTask.isGPUEnabled()) } func TestInitializeContainerOrderingWithLinksAndVolumesFrom(t *testing.T) { containerWithOnlyVolume := &apicontainer.Container{ - Name: "myName", - Image: "image:tag", + Name: "myName", + Image: "image:tag", VolumesFrom: []apicontainer.VolumeFrom{{SourceContainer: "myName1"}}, } containerWithOnlyLink := &apicontainer.Container{ Name: "myName1", Image: "image:tag", - Links: []string{"myName"}, + Links: []string{"myName"}, } containerWithBothVolumeAndLink := &apicontainer.Container{ - Name: "myName2", - Image: "image:tag", + Name: "myName2", + Image: "image:tag", VolumesFrom: []apicontainer.VolumeFrom{{SourceContainer: "myName"}}, - Links: []string{"myName1"}, + Links: []string{"myName1"}, } containerWithNoVolumeOrLink := &apicontainer.Container{ @@ -2877,8 +2876,8 @@ func TestInitializeContainerOrderingWithLinksAndVolumesFrom(t *testing.T) { task := &Task{ Arn: "test", ResourcesMapUnsafe: make(map[string][]taskresource.TaskResource), - Containers: []*apicontainer.Container{containerWithOnlyVolume, containerWithOnlyLink, - containerWithBothVolumeAndLink, containerWithNoVolumeOrLink}, + Containers: []*apicontainer.Container{containerWithOnlyVolume, containerWithOnlyLink, + containerWithBothVolumeAndLink, containerWithNoVolumeOrLink}, } err := task.initializeContainerOrderingForVolumes() @@ -2887,27 +2886,27 @@ func TestInitializeContainerOrderingWithLinksAndVolumesFrom(t *testing.T) { assert.NoError(t, err) containerResultWithVolume := task.Containers[0] - assert.Equal(t, "myName1", containerResultWithVolume.DependsOn[0].Container) - assert.Equal(t, ContainerOrderingStartCondition, containerResultWithVolume.DependsOn[0].Condition) + assert.Equal(t, "myName1", containerResultWithVolume.DependsOn[0].Container) + assert.Equal(t, ContainerOrderingStartCondition, containerResultWithVolume.DependsOn[0].Condition) containerResultWithLink := task.Containers[1] - assert.Equal(t, "myName", containerResultWithLink.DependsOn[0].Container) - assert.Equal(t, ContainerOrderingRunningCondition, containerResultWithLink.DependsOn[0].Condition) + assert.Equal(t, "myName", containerResultWithLink.DependsOn[0].Container) + assert.Equal(t, ContainerOrderingRunningCondition, containerResultWithLink.DependsOn[0].Condition) containerResultWithBothVolumeAndLink := task.Containers[2] - assert.Equal(t, "myName", containerResultWithBothVolumeAndLink.DependsOn[0].Container) - assert.Equal(t, ContainerOrderingStartCondition, containerResultWithBothVolumeAndLink.DependsOn[0].Condition) - assert.Equal(t, "myName1", containerResultWithBothVolumeAndLink.DependsOn[1].Container) - assert.Equal(t, ContainerOrderingRunningCondition, containerResultWithBothVolumeAndLink.DependsOn[1].Condition) + assert.Equal(t, "myName", containerResultWithBothVolumeAndLink.DependsOn[0].Container) + assert.Equal(t, ContainerOrderingStartCondition, containerResultWithBothVolumeAndLink.DependsOn[0].Condition) + assert.Equal(t, "myName1", containerResultWithBothVolumeAndLink.DependsOn[1].Container) + assert.Equal(t, ContainerOrderingRunningCondition, containerResultWithBothVolumeAndLink.DependsOn[1].Condition) containerResultWithNoVolumeOrLink := task.Containers[3] - assert.Equal(t, 0, len(containerResultWithNoVolumeOrLink.DependsOn)) + assert.Equal(t, 0, len(containerResultWithNoVolumeOrLink.DependsOn)) } func TestInitializeContainerOrderingWithError(t *testing.T) { containerWithVolumeError := &apicontainer.Container{ - Name: "myName", - Image: "image:tag", + Name: "myName", + Image: "image:tag", VolumesFrom: []apicontainer.VolumeFrom{{SourceContainer: "dummyContainer"}}, } @@ -2945,3 +2944,23 @@ func TestInitializeContainerOrderingWithError(t *testing.T) { errLink2 := task2.initializeContainerOrderingForLinks() assert.Error(t, errLink2) } + +func TestPostUnmarshalPerContainerTimeouts(t *testing.T) { + modelTimeout := int64(10) + expectedTimeout := uint(modelTimeout) + + taskFromACS := ecsacs.Task{ + Containers: []*ecsacs.Container{ + { + StartTimeout: aws.Int64(modelTimeout), + StopTimeout: aws.Int64(modelTimeout), + }, + }, + } + seqNum := int64(42) + task, err := TaskFromACS(&taskFromACS, &ecsacs.PayloadMessage{SeqNum: &seqNum}) + assert.Nil(t, err, "Should be able to handle acs task") + + assert.Equal(t, task.Containers[0].StartTimeout, expectedTimeout) + assert.Equal(t, task.Containers[0].StopTimeout, expectedTimeout) +} diff --git a/agent/dockerclient/dockerapi/docker_client.go b/agent/dockerclient/dockerapi/docker_client.go index 2eb0a4c17ed..6c8641b73db 100644 --- a/agent/dockerclient/dockerapi/docker_client.go +++ b/agent/dockerclient/dockerapi/docker_client.go @@ -91,6 +91,8 @@ const ( pullRetryJitterMultiplier = 0.2 ) +var ctxTimeoutStopContainer = dockerclient.StopContainerTimeout + // DockerClient interface to make testing it easier type DockerClient interface { // SupportedVersions returns a slice of the supported docker versions (or at least supposedly supported). @@ -550,8 +552,8 @@ func (dg *dockerGoClient) createContainer(ctx context.Context, return dg.containerMetadata(ctx, dockerContainer.ID) } -func (dg *dockerGoClient) StartContainer(ctx context.Context, id string, timeout time.Duration) DockerContainerMetadata { - ctx, cancel := context.WithTimeout(ctx, timeout) +func (dg *dockerGoClient) StartContainer(ctx context.Context, id string, ctxTimeout time.Duration) DockerContainerMetadata { + ctx, cancel := context.WithTimeout(ctx, ctxTimeout) defer cancel() defer metrics.MetricsEngineGlobal.RecordDockerMetric("START_CONTAINER")() // Buffered channel so in the case of timeout it takes one write, never gets @@ -566,7 +568,7 @@ func (dg *dockerGoClient) StartContainer(ctx context.Context, id string, timeout // send back the DockerTimeoutError err := ctx.Err() if err == context.DeadlineExceeded { - return DockerContainerMetadata{Error: &DockerTimeoutError{timeout, "started"}} + return DockerContainerMetadata{Error: &DockerTimeoutError{ctxTimeout, "started"}} } return DockerContainerMetadata{Error: CannotStartContainerError{err}} } @@ -654,13 +656,16 @@ func (dg *dockerGoClient) inspectContainer(ctx context.Context, dockerID string) } func (dg *dockerGoClient) StopContainer(ctx context.Context, dockerID string, timeout time.Duration) DockerContainerMetadata { - ctx, cancel := context.WithTimeout(ctx, timeout) + // ctxTimeout is sum of timeout(applied to the StopContainer api call) and a fixed constant dockerclient.StopContainerTimeout + // the context's timeout should be greater than the sigkill timout for the StopContainer call + ctxTimeout := timeout + ctxTimeoutStopContainer + ctx, cancel := context.WithTimeout(ctx, ctxTimeout) defer cancel() defer metrics.MetricsEngineGlobal.RecordDockerMetric("STOP_CONTAINER")() // Buffered channel so in the case of timeout it takes one write, never gets // read, and can still be GC'd response := make(chan DockerContainerMetadata, 1) - go func() { response <- dg.stopContainer(ctx, dockerID) }() + go func() { response <- dg.stopContainer(ctx, dockerID, timeout) }() select { case resp := <-response: return resp @@ -669,19 +674,18 @@ func (dg *dockerGoClient) StopContainer(ctx context.Context, dockerID string, ti // send back the DockerTimeoutError err := ctx.Err() if err == context.DeadlineExceeded { - return DockerContainerMetadata{Error: &DockerTimeoutError{timeout, "stopped"}} + return DockerContainerMetadata{Error: &DockerTimeoutError{ctxTimeout, "stopped"}} } return DockerContainerMetadata{Error: CannotStopContainerError{err}} } } -func (dg *dockerGoClient) stopContainer(ctx context.Context, dockerID string) DockerContainerMetadata { +func (dg *dockerGoClient) stopContainer(ctx context.Context, dockerID string, timeout time.Duration) DockerContainerMetadata { client, err := dg.sdkDockerClient() if err != nil { return DockerContainerMetadata{Error: CannotGetDockerClientError{version: dg.version, err: err}} } - - err = client.ContainerStop(ctx, dockerID, &dg.config.DockerStopTimeout) + err = client.ContainerStop(ctx, dockerID, &timeout) metadata := dg.containerMetadata(ctx, dockerID) if err != nil { seelog.Infof("DockerGoClient: error stopping container %s: %v", dockerID, err) diff --git a/agent/dockerclient/dockerapi/docker_client_test.go b/agent/dockerclient/dockerapi/docker_client_test.go index 2a487aa8b44..70db0d83bf7 100644 --- a/agent/dockerclient/dockerapi/docker_client_test.go +++ b/agent/dockerclient/dockerapi/docker_client_test.go @@ -468,6 +468,7 @@ func TestStopContainerTimeout(t *testing.T) { cfg.DockerStopTimeout = xContainerShortTimeout mockDockerSDK, client, _, _, _, done := dockerClientSetupWithConfig(t, cfg) defer done() + ctxTimeoutStopContainer = xContainerShortTimeout wait := &sync.WaitGroup{} wait.Add(1) diff --git a/agent/engine/docker_task_engine.go b/agent/engine/docker_task_engine.go index efd12e68337..f829a448dc7 100644 --- a/agent/engine/docker_task_engine.go +++ b/agent/engine/docker_task_engine.go @@ -955,7 +955,13 @@ func (engine *DockerTaskEngine) startContainer(task *apitask.Task, container *ap } } startContainerBegin := time.Now() - dockerContainerMD := client.StartContainer(engine.ctx, dockerContainer.DockerID, engine.cfg.ContainerStartTimeout) + + ctxTimeoutStartContainer := container.GetStartTimeout() + if ctxTimeoutStartContainer <= 0 { + ctxTimeoutStartContainer = engine.cfg.ContainerStartTimeout + } + + dockerContainerMD := client.StartContainer(engine.ctx, dockerContainer.DockerID, ctxTimeoutStartContainer) // Get metadata through container inspection and available task information then write this to the metadata file // Performs this in the background to avoid delaying container start @@ -1095,9 +1101,13 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api } seelog.Infof("Task engine [%s]: cleaned pause container network namespace", task.Arn) } - // timeout is defined by the const 'stopContainerTimeout' and the 'DockerStopTimeout' in the config - timeout := engine.cfg.DockerStopTimeout + dockerclient.StopContainerTimeout - return engine.client.StopContainer(engine.ctx, dockerContainer.DockerID, timeout) + + apiTimeoutStopContainer := container.GetStopTimeout() + if apiTimeoutStopContainer <= 0 { + apiTimeoutStopContainer = engine.cfg.DockerStopTimeout + } + + return engine.client.StopContainer(engine.ctx, dockerContainer.DockerID, apiTimeoutStopContainer) } func (engine *DockerTaskEngine) removeContainer(task *apitask.Task, container *apicontainer.Container) error { diff --git a/agent/engine/docker_task_engine_test.go b/agent/engine/docker_task_engine_test.go index eccffc202f3..89aa3ae504a 100644 --- a/agent/engine/docker_task_engine_test.go +++ b/agent/engine/docker_task_engine_test.go @@ -1271,7 +1271,7 @@ func TestStopPauseContainerCleanupCalled(t *testing.T) { mockCNIClient.EXPECT().CleanupNS(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil), dockerClient.EXPECT().StopContainer(gomock.Any(), containerID, - defaultConfig.DockerStopTimeout+dockerclient.StopContainerTimeout, + defaultConfig.DockerStopTimeout, ).Return(dockerapi.DockerContainerMetadata{}), ) diff --git a/agent/engine/engine_unix_integ_test.go b/agent/engine/engine_unix_integ_test.go index bdcb4d07313..9ef01080e16 100644 --- a/agent/engine/engine_unix_integ_test.go +++ b/agent/engine/engine_unix_integ_test.go @@ -1447,3 +1447,42 @@ func TestGPUAssociationTask(t *testing.T) { go taskEngine.AddTask(&taskUpdate) verifyTaskIsStopped(stateChangeEvents, testTask) } + +func TestPerContainerStopTimeout(t *testing.T) { + // set the global stop timemout, but this should be ignored since the per container value is set + globalStopContainerTimeout := 1000 * time.Second + os.Setenv("ECS_CONTAINER_STOP_TIMEOUT", globalStopContainerTimeout.String()) + defer os.Unsetenv("ECS_CONTAINER_STOP_TIMEOUT") + cfg := defaultTestConfigIntegTest() + + taskEngine, _, _ := setup(cfg, nil, t) + + dockerTaskEngine := taskEngine.(*DockerTaskEngine) + + if dockerTaskEngine.cfg.DockerStopTimeout != globalStopContainerTimeout { + t.Errorf("Expect ECS_CONTAINER_STOP_TIMEOUT to be set to , %v", dockerTaskEngine.cfg.DockerStopTimeout) + } + + testTask := createTestTask("TestDockerStopTimeout") + testTask.Containers[0].Command = []string{"sh", "-c", "trap 'echo hello' SIGTERM; while true; do echo `date +%T`; sleep 1s; done;"} + testTask.Containers[0].Image = testBusyboxImage + testTask.Containers[0].Name = "test-docker-timeout" + testTask.Containers[0].StopTimeout = uint(testDockerStopTimeout.Seconds()) + + go dockerTaskEngine.AddTask(testTask) + + verifyContainerRunningStateChange(t, taskEngine) + verifyTaskRunningStateChange(t, taskEngine) + + startTime := ttime.Now() + dockerTaskEngine.stopContainer(testTask, testTask.Containers[0]) + + verifyContainerStoppedStateChange(t, taskEngine) + + if ttime.Since(startTime) < testDockerStopTimeout { + t.Errorf("Container stopped before the timeout: %v", ttime.Since(startTime)) + } + if ttime.Since(startTime) > testDockerStopTimeout+1*time.Second { + t.Errorf("Container should have stopped eariler, but stopped after %v", ttime.Since(startTime)) + } +}