engine,dockerclient: per container start/stop timeouts #1849
@@ -91,6 +91,8 @@ const (
 	pullRetryJitterMultiplier = 0.2
 )
 
+var ctxTimeoutStopContainer = dockerclient.StopContainerTimeout
+
 // DockerClient interface to make testing it easier
 type DockerClient interface {
 	// SupportedVersions returns a slice of the supported docker versions (or at least supposedly supported).
@@ -550,8 +552,8 @@ func (dg *dockerGoClient) createContainer(ctx context.Context,
 	return dg.containerMetadata(ctx, dockerContainer.ID)
 }
 
-func (dg *dockerGoClient) StartContainer(ctx context.Context, id string, timeout time.Duration) DockerContainerMetadata {
-	ctx, cancel := context.WithTimeout(ctx, timeout)
+func (dg *dockerGoClient) StartContainer(ctx context.Context, id string, ctxTimeout time.Duration) DockerContainerMetadata {

Review comment: Can you add a similar comment for ctxTimeout here as well? Is there a valid range of values for this, like there is for the stop timeout?

Reply: No, this is just what we chose for the start container timeout. We have some sane defaults in #1321, but since we're allowing it to be configurable from the task definition, we don't have a range check at this layer. The service enforces a lower bound of 2s and no upper bound, and the Docker daemon's start container API doesn't enforce timeouts.

+	ctx, cancel := context.WithTimeout(ctx, ctxTimeout)
 	defer cancel()
 	defer metrics.MetricsEngineGlobal.RecordDockerMetric("START_CONTAINER")()
 	// Buffered channel so in the case of timeout it takes one write, never gets
@@ -566,7 +568,7 @@ func (dg *dockerGoClient) StartContainer(ctx context.Context, id string, timeout
 		// send back the DockerTimeoutError
 		err := ctx.Err()
 		if err == context.DeadlineExceeded {
-			return DockerContainerMetadata{Error: &DockerTimeoutError{timeout, "started"}}
+			return DockerContainerMetadata{Error: &DockerTimeoutError{ctxTimeout, "started"}}
 		}
 		return DockerContainerMetadata{Error: CannotStartContainerError{err}}
 	}
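The exchange above explains that the per-container start timeout comes from the task definition and is only range-checked service-side (2s lower bound, no upper bound); at this layer the agent simply falls back to the configured default when no per-container value is set. A small self-contained sketch of that fallback, using a hypothetical helper name (the PR itself inlines the same logic in the task engine's startContainer, shown further below):

```go
package main

import (
	"fmt"
	"time"
)

// resolveStartTimeout is a hypothetical helper for illustration only: prefer the
// per-container start timeout from the task definition when it is set, otherwise
// fall back to the agent-wide default (engine.cfg.ContainerStartTimeout in the PR).
func resolveStartTimeout(perContainer, defaultTimeout time.Duration) time.Duration {
	if perContainer <= 0 {
		return defaultTimeout
	}
	return perContainer
}

func main() {
	agentDefault := 3 * time.Minute // assumed default, for illustration only
	fmt.Println(resolveStartTimeout(0, agentDefault))              // no per-container value -> 3m0s
	fmt.Println(resolveStartTimeout(45*time.Second, agentDefault)) // per-container value wins -> 45s
}
```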
@@ -654,13 +656,16 @@ func (dg *dockerGoClient) inspectContainer(ctx context.Context, dockerID string)
 }
 
 func (dg *dockerGoClient) StopContainer(ctx context.Context, dockerID string, timeout time.Duration) DockerContainerMetadata {
-	ctx, cancel := context.WithTimeout(ctx, timeout)
+	// ctxTimeout is the sum of timeout (applied to the StopContainer API call) and a fixed constant, dockerclient.StopContainerTimeout.
+	// The context's timeout should be greater than the SIGKILL timeout for the StopContainer call.
+	ctxTimeout := timeout + ctxTimeoutStopContainer
+	ctx, cancel := context.WithTimeout(ctx, ctxTimeout)
 	defer cancel()
 	defer metrics.MetricsEngineGlobal.RecordDockerMetric("STOP_CONTAINER")()
 	// Buffered channel so in the case of timeout it takes one write, never gets
 	// read, and can still be GC'd
 	response := make(chan DockerContainerMetadata, 1)
-	go func() { response <- dg.stopContainer(ctx, dockerID) }()
+	go func() { response <- dg.stopContainer(ctx, dockerID, timeout) }()

Review comment: Should this be passing ctxTimeout? What is the difference between timeout and ctxTimeout? For my understanding, why isn't StartContainer handled the same way with timeout?

Reply: There is some information in the comment above this code and in the PR description, but to summarize: ctxTimeout is the timeout for the context the agent uses for the actual call to Docker, whereas timeout is a parameter for the stop container API call itself. The timeout parameter is how long the Docker daemon will wait after a docker stop before sending SIGKILL to the container, so we want our context timeout to be greater than the daemon's SIGKILL timeout.

 	select {
 	case resp := <-response:
 		return resp
@@ -669,19 +674,18 @@ func (dg *dockerGoClient) StopContainer(ctx context.Context, dockerID string, timeout
 		// send back the DockerTimeoutError
 		err := ctx.Err()
 		if err == context.DeadlineExceeded {
-			return DockerContainerMetadata{Error: &DockerTimeoutError{timeout, "stopped"}}
+			return DockerContainerMetadata{Error: &DockerTimeoutError{ctxTimeout, "stopped"}}
 		}
 		return DockerContainerMetadata{Error: CannotStopContainerError{err}}
 	}
 }
 
-func (dg *dockerGoClient) stopContainer(ctx context.Context, dockerID string) DockerContainerMetadata {
+func (dg *dockerGoClient) stopContainer(ctx context.Context, dockerID string, timeout time.Duration) DockerContainerMetadata {
 	client, err := dg.sdkDockerClient()
 	if err != nil {
 		return DockerContainerMetadata{Error: CannotGetDockerClientError{version: dg.version, err: err}}
 	}
 
-	err = client.ContainerStop(ctx, dockerID, &dg.config.DockerStopTimeout)
+	err = client.ContainerStop(ctx, dockerID, &timeout)
 	metadata := dg.containerMetadata(ctx, dockerID)
 	if err != nil {
 		seelog.Infof("DockerGoClient: error stopping container %s: %v", dockerID, err)
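To make the distinction in the thread above concrete: timeout travels to the Docker daemon with the stop request and controls how long the daemon waits after SIGTERM before it sends SIGKILL, while ctxTimeout only bounds how long the agent itself waits on the call. A minimal, self-contained sketch of that relationship; the 30-second values and the stopContainerBuffer constant are illustrative stand-ins, not the agent's actual defaults:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	// Per-container stop timeout from the task definition: how long the Docker
	// daemon waits after SIGTERM before sending SIGKILL (illustrative value).
	apiTimeout := 30 * time.Second

	// Stand-in for dockerclient.StopContainerTimeout: extra headroom so the
	// agent-side context outlives the daemon's SIGKILL grace period.
	const stopContainerBuffer = 30 * time.Second

	// Agent-side context deadline, as computed in StopContainer above.
	ctxTimeout := apiTimeout + stopContainerBuffer

	ctx, cancel := context.WithTimeout(context.Background(), ctxTimeout)
	defer cancel()

	// The real code calls client.ContainerStop(ctx, dockerID, &apiTimeout) here;
	// if ctx expires first, the agent reports a DockerTimeoutError instead.
	deadline, _ := ctx.Deadline()
	fmt.Println("daemon SIGKILL grace:", apiTimeout, "- agent deadline:", deadline)
}
```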
@@ -955,7 +955,13 @@ func (engine *DockerTaskEngine) startContainer(task *apitask.Task, container *apicontainer.Container
 		}
 	}
 	startContainerBegin := time.Now()
-	dockerContainerMD := client.StartContainer(engine.ctx, dockerContainer.DockerID, engine.cfg.ContainerStartTimeout)
+
+	ctxTimeoutStartContainer := container.GetStartTimeout()
+	if ctxTimeoutStartContainer <= 0 {
+		ctxTimeoutStartContainer = engine.cfg.ContainerStartTimeout
+	}
+
+	dockerContainerMD := client.StartContainer(engine.ctx, dockerContainer.DockerID, ctxTimeoutStartContainer)
 
 	// Get metadata through container inspection and available task information then write this to the metadata file
 	// Performs this in the background to avoid delaying container start
@@ -1095,9 +1101,13 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *apicontainer.Container
 		}
 		seelog.Infof("Task engine [%s]: cleaned pause container network namespace", task.Arn)
 	}
-	// timeout is defined by the const 'stopContainerTimeout' and the 'DockerStopTimeout' in the config
-	timeout := engine.cfg.DockerStopTimeout + dockerclient.StopContainerTimeout
-	return engine.client.StopContainer(engine.ctx, dockerContainer.DockerID, timeout)
+
+	apiTimeoutStopContainer := container.GetStopTimeout()

Review comment: How about calling this …?

Reply: I don't have a strong feeling either way, to be honest.

+	if apiTimeoutStopContainer <= 0 {
+		apiTimeoutStopContainer = engine.cfg.DockerStopTimeout
+	}
+
+	return engine.client.StopContainer(engine.ctx, dockerContainer.DockerID, apiTimeoutStopContainer)
 }
 
 func (engine *DockerTaskEngine) removeContainer(task *apitask.Task, container *apicontainer.Container) error {
@@ -1447,3 +1447,42 @@ func TestGPUAssociationTask(t *testing.T) {
 	go taskEngine.AddTask(&taskUpdate)
 	verifyTaskIsStopped(stateChangeEvents, testTask)
 }
+
+func TestPerContainerStopTimeout(t *testing.T) {

Review comment: Is there any reason why this is under unix?

Reply: Adding a TODO for a Windows test.

+	// set the global stop timeout, but this should be ignored since the per container value is set
+	globalStopContainerTimeout := 1000 * time.Second
+	os.Setenv("ECS_CONTAINER_STOP_TIMEOUT", globalStopContainerTimeout.String())
+	defer os.Unsetenv("ECS_CONTAINER_STOP_TIMEOUT")
+	cfg := defaultTestConfigIntegTest()
+
+	taskEngine, _, _ := setup(cfg, nil, t)
+
+	dockerTaskEngine := taskEngine.(*DockerTaskEngine)
+
+	if dockerTaskEngine.cfg.DockerStopTimeout != globalStopContainerTimeout {
+		t.Errorf("Expect ECS_CONTAINER_STOP_TIMEOUT to be set to %v, but was %v", globalStopContainerTimeout, dockerTaskEngine.cfg.DockerStopTimeout)
+	}
+
+	testTask := createTestTask("TestDockerStopTimeout")
+	testTask.Containers[0].Command = []string{"sh", "-c", "trap 'echo hello' SIGTERM; while true; do echo `date +%T`; sleep 1s; done;"}
+	testTask.Containers[0].Image = testBusyboxImage
+	testTask.Containers[0].Name = "test-docker-timeout"
+	testTask.Containers[0].StopTimeout = uint(testDockerStopTimeout.Seconds())
+
+	go dockerTaskEngine.AddTask(testTask)
+
+	verifyContainerRunningStateChange(t, taskEngine)
+	verifyTaskRunningStateChange(t, taskEngine)
+
+	startTime := ttime.Now()
+	dockerTaskEngine.stopContainer(testTask, testTask.Containers[0])
+
+	verifyContainerStoppedStateChange(t, taskEngine)
+
+	if ttime.Since(startTime) < testDockerStopTimeout {
+		t.Errorf("Container stopped before the timeout: %v", ttime.Since(startTime))
+	}
+	if ttime.Since(startTime) > testDockerStopTimeout+1*time.Second {
+		t.Errorf("Container should have stopped earlier, but stopped after %v", ttime.Since(startTime))
+	}
+}
@@ -78,7 +78,10 @@ const (
 	//    a) Add 'Associations' field to 'api.task.task'
 	//    b) Add 'GPUIDs' field to 'apicontainer.Container'
 	//    c) Add 'NvidiaRuntime' field to 'api.task.task'
-	// 20) Add 'DependsOn' field to 'apicontainer.Container'
+	// 20)
+	//    a) Add 'DependsOn' field to 'apicontainer.Container'
+	//    b) Add 'StartTime' field to 'api.container.Container'

Review comment: Could you please add a test for the state file version bump in the state manager package?

Reply: Good catch, will add state manager unit tests for this.

+	//    c) Add 'StopTime' field to 'api.container.Container'
 	ECSDataVersion = 20
 
 	// ecsDataFile specifies the filename in the ECS_DATADIR
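The reply above commits to adding a state manager unit test for the version bump. A minimal sketch of what such a test might look like; the test name and wording are hypothetical, assuming it sits in the same statemanager package as the constant:

```go
package statemanager

import "testing"

// TestECSDataVersion is a hypothetical sketch of the unit test promised above:
// the StartTime/StopTime container fields are recorded under data format
// version 20, so the constant must have been bumped along with the change log.
func TestECSDataVersion(t *testing.T) {
	const expectedVersion = 20
	if ECSDataVersion != expectedVersion {
		t.Errorf("expected ECSDataVersion to be %d, got %d", expectedVersion, ECSDataVersion)
	}
}
```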
Review comment: Can we make sure the ECS service treats it as int64 as well?

Reply: We're treating this field the same way we do cpu/mem.