Skip to content

Commit

Permalink
Modified pull image retry with 1100ms * 2^n strategy and rename Simpl…
Browse files Browse the repository at this point in the history
…eBackoffRetry to ExponentialBackoffRetry into a separate retry package
  • Loading branch information
suneyz committed Jan 25, 2019
1 parent cb84a66 commit 6f58865
Show file tree
Hide file tree
Showing 22 changed files with 448 additions and 352 deletions.
16 changes: 8 additions & 8 deletions agent/acs/handler/acs_handler.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2014-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// Copyright 2014-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
Expand All @@ -23,7 +23,7 @@ import (
"strings"
"time"

acsclient "github.com/aws/amazon-ecs-agent/agent/acs/client"
"github.com/aws/amazon-ecs-agent/agent/acs/client"
"github.com/aws/amazon-ecs-agent/agent/acs/model/ecsacs"
"github.com/aws/amazon-ecs-agent/agent/acs/update_handler"
"github.com/aws/amazon-ecs-agent/agent/api"
Expand All @@ -34,7 +34,7 @@ import (
"github.com/aws/amazon-ecs-agent/agent/eventhandler"
"github.com/aws/amazon-ecs-agent/agent/eventstream"
"github.com/aws/amazon-ecs-agent/agent/statemanager"
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/agent/utils/retry"
"github.com/aws/amazon-ecs-agent/agent/utils/ttime"
"github.com/aws/amazon-ecs-agent/agent/version"
"github.com/aws/amazon-ecs-agent/agent/wsclient"
Expand Down Expand Up @@ -88,7 +88,7 @@ type session struct {
taskHandler *eventhandler.TaskHandler
ctx context.Context
cancel context.CancelFunc
backoff utils.Backoff
backoff retry.Backoff
resources sessionResources
_heartbeatTimeout time.Duration
_heartbeatJitter time.Duration
Expand Down Expand Up @@ -145,7 +145,7 @@ func NewSession(ctx context.Context,
credentialsManager rolecredentials.Manager,
taskHandler *eventhandler.TaskHandler) Session {
resources := newSessionResources(credentialsProvider)
backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax,
backoff := retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax,
connectionBackoffJitter, connectionBackoffMultiplier)
derivedContext, cancel := context.WithCancel(ctx)

Expand Down Expand Up @@ -318,7 +318,7 @@ func (acsSession *session) startACSSession(client wsclient.ClientServer) error {
acsSession.resources.connectedToACS()

backoffResetTimer := time.AfterFunc(
utils.AddJitter(acsSession.heartbeatTimeout(), acsSession.heartbeatJitter()), func() {
retry.AddJitter(acsSession.heartbeatTimeout(), acsSession.heartbeatJitter()), func() {
// If we do not have an error connecting and remain connected for at
// least 1 or so minutes, reset the backoff. This prevents disconnect
// errors that only happen infrequently from damaging the reconnect
Expand Down Expand Up @@ -423,7 +423,7 @@ func acsWsURL(endpoint, cluster, containerInstanceArn string, taskEngine engine.
// newDisconnectionTimer creates a new time object, with a callback to
// disconnect from ACS on inactivity
func newDisconnectionTimer(client wsclient.ClientServer, timeout time.Duration, jitter time.Duration) ttime.Timer {
timer := time.AfterFunc(utils.AddJitter(timeout, jitter), func() {
timer := time.AfterFunc(retry.AddJitter(timeout, jitter), func() {
seelog.Warn("ACS Connection hasn't had any activity for too long; closing connection")
if err := client.Close(); err != nil {
seelog.Warnf("Error disconnecting: %v", err)
Expand All @@ -445,7 +445,7 @@ func anyMessageHandler(timer ttime.Timer, client wsclient.ClientServer) func(int
}

// Reset heartbeat timer
timer.Reset(utils.AddJitter(heartbeatTimeout, heartbeatJitter))
timer.Reset(retry.AddJitter(heartbeatTimeout, heartbeatJitter))
}
}

Expand Down
22 changes: 11 additions & 11 deletions agent/acs/handler/acs_handler_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// +build unit

// Copyright 2014-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// Copyright 2014-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
Expand Down Expand Up @@ -41,8 +41,8 @@ import (
"github.com/aws/amazon-ecs-agent/agent/eventhandler"
"github.com/aws/amazon-ecs-agent/agent/eventstream"
"github.com/aws/amazon-ecs-agent/agent/statemanager"
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/agent/utils/mocks"
"github.com/aws/amazon-ecs-agent/agent/utils/retry"
"github.com/aws/amazon-ecs-agent/agent/version"
"github.com/aws/amazon-ecs-agent/agent/wsclient"
"github.com/aws/amazon-ecs-agent/agent/wsclient/mock"
Expand Down Expand Up @@ -234,7 +234,7 @@ func TestHandlerReconnectsOnConnectErrors(t *testing.T) {
ecsClient: ecsClient,
stateManager: stateManager,
taskHandler: taskHandler,
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
ctx: ctx,
cancel: cancel,
resources: &mockSessionResources{mockWsClient},
Expand Down Expand Up @@ -501,7 +501,7 @@ func TestHandlerGeneratesDeregisteredInstanceEvent(t *testing.T) {
deregisterInstanceEventStream: deregisterInstanceEventStream,
stateManager: stateManager,
taskHandler: taskHandler,
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
ctx: ctx,
cancel: cancel,
resources: &mockSessionResources{mockWsClient},
Expand Down Expand Up @@ -570,7 +570,7 @@ func TestHandlerReconnectDelayForInactiveInstanceError(t *testing.T) {
deregisterInstanceEventStream: deregisterInstanceEventStream,
stateManager: stateManager,
taskHandler: taskHandler,
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
ctx: ctx,
cancel: cancel,
resources: &mockSessionResources{mockWsClient},
Expand Down Expand Up @@ -627,7 +627,7 @@ func TestHandlerReconnectsOnServeErrors(t *testing.T) {
ecsClient: ecsClient,
stateManager: stateManager,
taskHandler: taskHandler,
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
ctx: ctx,
cancel: cancel,
resources: &mockSessionResources{mockWsClient},
Expand Down Expand Up @@ -678,7 +678,7 @@ func TestHandlerStopsWhenContextIsCancelled(t *testing.T) {
ecsClient: ecsClient,
stateManager: stateManager,
taskHandler: taskHandler,
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
ctx: ctx,
cancel: cancel,
resources: &mockSessionResources{mockWsClient},
Expand Down Expand Up @@ -732,7 +732,7 @@ func TestHandlerReconnectsOnDiscoverPollEndpointError(t *testing.T) {
ecsClient: ecsClient,
stateManager: stateManager,
taskHandler: taskHandler,
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
ctx: ctx,
cancel: cancel,
resources: &mockSessionResources{mockWsClient},
Expand Down Expand Up @@ -806,7 +806,7 @@ func TestConnectionIsClosedOnIdle(t *testing.T) {
stateManager: stateManager,
taskHandler: taskHandler,
ctx: context.Background(),
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
resources: &mockSessionResources{},
_heartbeatTimeout: 20 * time.Millisecond,
_heartbeatJitter: 10 * time.Millisecond,
Expand Down Expand Up @@ -861,7 +861,7 @@ func TestHandlerDoesntLeakGoroutines(t *testing.T) {
taskHandler: taskHandler,
ctx: ctx,
_heartbeatTimeout: 1 * time.Second,
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
resources: newSessionResources(testCreds),
credentialsManager: rolecredentials.NewManager(),
}
Expand Down Expand Up @@ -1062,7 +1062,7 @@ func TestHandlerReconnectsCorrectlySetsSendCredentialsURLParameter(t *testing.T)
taskHandler: taskHandler,
ctx: ctx,
resources: resources,
backoff: utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
backoff: retry.NewExponentialBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier),
_heartbeatTimeout: 20 * time.Millisecond,
_heartbeatJitter: 10 * time.Millisecond,
}
Expand Down
16 changes: 9 additions & 7 deletions agent/dockerclient/dockerapi/docker_client.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2014-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// Copyright 2014-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
Expand Down Expand Up @@ -41,6 +41,7 @@ import (
"github.com/aws/amazon-ecs-agent/agent/ecr"
"github.com/aws/amazon-ecs-agent/agent/metrics"
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/agent/utils/retry"
"github.com/aws/amazon-ecs-agent/agent/utils/ttime"

"github.com/cihub/seelog"
Expand Down Expand Up @@ -84,9 +85,9 @@ const (

// retry settings for pulling images
maximumPullRetries = 5
minimumPullRetryDelay = 250 * time.Millisecond
maximumPullRetryDelay = 1 * time.Second
pullRetryDelayMultiplier = 1.5
minimumPullRetryDelay = 1100 * time.Millisecond
maximumPullRetryDelay = 5 * time.Second
pullRetryDelayMultiplier = 2
pullRetryJitterMultiplier = 0.2
)

Expand Down Expand Up @@ -205,6 +206,7 @@ type dockerGoClient struct {
ecrTokenCache async.Cache
config *config.Config
context context.Context
imagePullBackoff retry.Backoff

_time ttime.Time
_timeOnce sync.Once
Expand Down Expand Up @@ -270,6 +272,8 @@ func NewDockerGoClient(sdkclientFactory sdkclientfactory.Factory,
ecrTokenCache: async.NewLRUCache(tokenCacheSize, tokenCacheTTL),
config: cfg,
context: ctx,
imagePullBackoff: retry.NewExponentialBackoff(minimumPullRetryDelay, maximumPullRetryDelay,
pullRetryJitterMultiplier, pullRetryDelayMultiplier),
}, nil
}

Expand Down Expand Up @@ -298,9 +302,7 @@ func (dg *dockerGoClient) PullImage(image string, authData *apicontainer.Registr
defer metrics.MetricsEngineGlobal.RecordDockerMetric("PULL_IMAGE")()
response := make(chan DockerContainerMetadata, 1)
go func() {
imagePullBackoff := utils.NewSimpleBackoff(minimumPullRetryDelay,
maximumPullRetryDelay, pullRetryJitterMultiplier, pullRetryDelayMultiplier)
err := utils.RetryNWithBackoffCtx(ctx, imagePullBackoff, maximumPullRetries,
err := retry.RetryNWithBackoffCtx(ctx, dg.imagePullBackoff, maximumPullRetries,
func() error {
err := dg.pullImage(ctx, image, authData)
if err != nil {
Expand Down
14 changes: 13 additions & 1 deletion agent/dockerclient/dockerapi/docker_client_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// +build unit

// Copyright 2014-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// Copyright 2014-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
Expand Down Expand Up @@ -39,6 +39,7 @@ import (
"github.com/aws/amazon-ecs-agent/agent/ec2"
"github.com/aws/amazon-ecs-agent/agent/ecr/mocks"
ecrapi "github.com/aws/amazon-ecs-agent/agent/ecr/model/ecr"
"github.com/aws/amazon-ecs-agent/agent/utils/retry"
"github.com/aws/amazon-ecs-agent/agent/utils/ttime/mocks"

"github.com/aws/aws-sdk-go/aws"
Expand All @@ -65,6 +66,15 @@ const xContainerShortTimeout = 1 * time.Millisecond
// upon the expiration of the timeout duration.
const xImageShortTimeout = 1 * time.Millisecond

const (
// retry settings for pulling images mock backoff
xMaximumPullRetries = 5
xMinimumPullRetryDelay = 25 * time.Millisecond
xMaximumPullRetryDelay = 100 * time.Microsecond
xPullRetryDelayMultiplier = 2
xPullRetryJitterMultiplier = 0.2
)

func defaultTestConfig() *config.Config {
cfg, _ := config.NewConfig(ec2.NewBlackholeEC2MetadataClient())
return cfg
Expand Down Expand Up @@ -104,6 +114,8 @@ func dockerClientSetupWithConfig(t *testing.T, conf config.Config) (
ecrClientFactory := mock_ecr.NewMockECRFactory(ctrl)
goClient.ecrClientFactory = ecrClientFactory
goClient._time = mockTime
goClient.imagePullBackoff = retry.NewExponentialBackoff(xMinimumPullRetryDelay, xMaximumPullRetryDelay,
xPullRetryJitterMultiplier, xPullRetryDelayMultiplier)
return mockDockerSDK, goClient, mockTime, ctrl, ecrClientFactory, ctrl.Finish
}

Expand Down
6 changes: 3 additions & 3 deletions agent/dockerclient/dockerauth/ecr.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2014-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// Copyright 2014-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
Expand All @@ -24,7 +24,7 @@ import (
"github.com/aws/amazon-ecs-agent/agent/credentials"
"github.com/aws/amazon-ecs-agent/agent/ecr"
ecrapi "github.com/aws/amazon-ecs-agent/agent/ecr/model/ecr"
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/agent/utils/retry"
"github.com/aws/aws-sdk-go/aws"
log "github.com/cihub/seelog"
"github.com/docker/docker/api/types"
Expand Down Expand Up @@ -182,7 +182,7 @@ func (authProvider *ecrAuthProvider) IsTokenValid(authData *ecrapi.Authorization
}

refreshTime := aws.TimeValue(authData.ExpiresAt).
Add(-1 * utils.AddJitter(MinimumJitterDuration, MinimumJitterDuration))
Add(-1 * retry.AddJitter(MinimumJitterDuration, MinimumJitterDuration))

return time.Now().Before(refreshTime)
}
35 changes: 18 additions & 17 deletions agent/engine/docker_task_engine.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2014-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// Copyright 2014-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
Expand Down Expand Up @@ -41,6 +41,7 @@ import (
"github.com/aws/amazon-ecs-agent/agent/statemanager"
"github.com/aws/amazon-ecs-agent/agent/taskresource"
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/agent/utils/retry"
utilsync "github.com/aws/amazon-ecs-agent/agent/utils/sync"
"github.com/aws/amazon-ecs-agent/agent/utils/ttime"

Expand All @@ -53,20 +54,19 @@ const (
//DockerEndpointEnvVariable is the environment variable that can override the Docker endpoint
DockerEndpointEnvVariable = "DOCKER_HOST"
// DockerDefaultEndpoint is the default value for the Docker endpoint
DockerDefaultEndpoint = "unix:///var/run/docker.sock"
capabilityPrefix = "com.amazonaws.ecs.capability."
capabilityTaskIAMRole = "task-iam-role"
capabilityTaskIAMRoleNetHost = "task-iam-role-network-host"
capabilityTaskCPUMemLimit = "task-cpu-mem-limit"
attributePrefix = "ecs.capability."
labelPrefix = "com.amazonaws.ecs."
labelTaskARN = labelPrefix + "task-arn"
labelContainerName = labelPrefix + "container-name"
labelTaskDefinitionFamily = labelPrefix + "task-definition-family"
labelTaskDefinitionVersion = labelPrefix + "task-definition-version"
labelCluster = labelPrefix + "cluster"
cniSetupTimeout = 1 * time.Minute
cniCleanupTimeout = 30 * time.Second
DockerDefaultEndpoint = "unix:///var/run/docker.sock"
labelPrefix = "com.amazonaws.ecs."
labelTaskARN = labelPrefix + "task-arn"
labelContainerName = labelPrefix + "container-name"
labelTaskDefinitionFamily = labelPrefix + "task-definition-family"
labelTaskDefinitionVersion = labelPrefix + "task-definition-version"
labelCluster = labelPrefix + "cluster"
cniSetupTimeout = 1 * time.Minute
cniCleanupTimeout = 30 * time.Second
minEngineConnectRetryDelay = 200 * time.Second
maxEngineConnectRetryDelay = 2 * time.Second
engineConnectRetryJitterMultiplier = 0.20
engineConnectRetryDelayMultiplier = 1.5
)

// DockerTaskEngine is a state machine for managing a task and its containers
Expand Down Expand Up @@ -229,8 +229,9 @@ func (engine *DockerTaskEngine) MustInit(ctx context.Context) {
defer engine.mustInitLock.Unlock()

errorOnce := sync.Once{}
taskEngineConnectBackoff := utils.NewSimpleBackoff(200*time.Millisecond, 2*time.Second, 0.20, 1.5)
utils.RetryWithBackoff(taskEngineConnectBackoff, func() error {
taskEngineConnectBackoff := retry.NewExponentialBackoff(minEngineConnectRetryDelay, maxEngineConnectRetryDelay,
engineConnectRetryJitterMultiplier, engineConnectRetryDelayMultiplier)
retry.RetryWithBackoff(taskEngineConnectBackoff, func() error {
if engine.initialized {
return nil
}
Expand Down
8 changes: 4 additions & 4 deletions agent/eni/networkutils/utils_linux.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// +build linux

// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
Expand All @@ -24,7 +24,7 @@ import (

apierrors "github.com/aws/amazon-ecs-agent/agent/api/errors"
"github.com/aws/amazon-ecs-agent/agent/eni/netlinkwrapper"
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/agent/utils/retry"
"github.com/pkg/errors"

"github.com/cihub/seelog"
Expand Down Expand Up @@ -79,12 +79,12 @@ func GetMACAddress(ctx context.Context,
// address is empty, it retries the operation with a timeout specified by the
// caller
func (retriever *macAddressRetriever) retrieve() (string, error) {
backoff := utils.NewSimpleBackoff(macAddressBackoffMin, macAddressBackoffMax,
backoff := retry.NewExponentialBackoff(macAddressBackoffMin, macAddressBackoffMax,
macAddressBackoffJitter, macAddressBackoffMultiple)
ctx, cancel := context.WithTimeout(retriever.ctx, retriever.timeout)
defer cancel()

err := utils.RetryWithBackoffCtx(ctx, backoff, func() error {
err := retry.RetryWithBackoffCtx(ctx, backoff, func() error {
retErr := retriever.retrieveOnce()
if retErr != nil {
seelog.Warnf("Unable to retrieve mac address for device '%s': %v",
Expand Down
Loading

0 comments on commit 6f58865

Please sign in to comment.