Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate with tcsHandler in ecs-agent module #3743

Merged
merged 4 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 13 additions & 16 deletions agent/app/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ import (
"github.com/aws/amazon-ecs-agent/agent/sighandlers/exitcodes"
"github.com/aws/amazon-ecs-agent/agent/statemanager"
"github.com/aws/amazon-ecs-agent/agent/stats"
"github.com/aws/amazon-ecs-agent/agent/stats/reporter"
"github.com/aws/amazon-ecs-agent/agent/taskresource"
tcshandler "github.com/aws/amazon-ecs-agent/agent/tcs/handler"
"github.com/aws/amazon-ecs-agent/agent/utils"
"github.com/aws/amazon-ecs-agent/agent/utils/loader"
"github.com/aws/amazon-ecs-agent/agent/utils/mobypkgwrapper"
Expand Down Expand Up @@ -871,21 +871,18 @@ func (agent *ecsAgent) startAsyncRoutines(
}
go statsEngine.StartMetricsPublish()

telemetrySessionParams := tcshandler.TelemetrySessionParams{
Ctx: agent.ctx,
CredentialProvider: agent.credentialProvider,
Cfg: agent.cfg,
ContainerInstanceArn: agent.containerInstanceARN,
DeregisterInstanceEventStream: deregisterInstanceEventStream,
ECSClient: client,
TaskEngine: taskEngine,
StatsEngine: statsEngine,
MetricsChannel: telemetryMessages,
HealthChannel: healthMessages,
Doctor: doctor,
}
// Start metrics session in a go routine
go tcshandler.StartMetricsSession(&telemetrySessionParams)
session, err := reporter.NewDockerTelemetrySession(agent.containerInstanceARN, agent.credentialProvider, agent.cfg, deregisterInstanceEventStream,
client, taskEngine, telemetryMessages, healthMessages, doctor)
if err != nil {
seelog.Warnf("Error creating telemetry session: %v", err)
return
}
if session == nil {
seelog.Infof("Metrics disabled on the instance.")
return
}

go session.Start(agent.ctx)
}

func (agent *ecsAgent) startSpotInstanceDrainingPoller(ctx context.Context, client api.ECSClient) {
Expand Down
2 changes: 1 addition & 1 deletion agent/app/agent_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ func TestDoStartCgroupInitHappyPath(t *testing.T) {
}).Return("poll-endpoint", nil),
client.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return("acs-endpoint", nil).AnyTimes(),
client.EXPECT().DiscoverTelemetryEndpoint(gomock.Any()).Do(func(x interface{}) {
// Ensures that the test waits until telemetry session has bee started
// Ensures that the test waits until telemetry session has been started
discoverEndpointsInvoked.Done()
}).Return("telemetry-endpoint", nil),
client.EXPECT().DiscoverTelemetryEndpoint(gomock.Any()).Return(
Expand Down
58 changes: 40 additions & 18 deletions agent/stats/reporter/reporter.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package reporter

import (
Expand All @@ -19,7 +32,6 @@ import (
"github.com/aws/amazon-ecs-agent/ecs-agent/utils/retry"
"github.com/aws/amazon-ecs-agent/ecs-agent/wsclient"
"github.com/aws/aws-sdk-go/aws/credentials"
"github.com/cihub/seelog"
)

const (
Expand All @@ -29,6 +41,10 @@ const (
// Default websocket client disconnection timeout initiated by agent
defaultDisconnectionTimeout = 15 * time.Minute
defaultDisconnectionJitter = 30 * time.Minute
backoffMin = 1 * time.Second
backoffMax = 1 * time.Minute
jitterMultiple = 0.2
multiple = 2
)

type DockerTelemetrySession struct {
Expand All @@ -48,22 +64,20 @@ func NewDockerTelemetrySession(
taskEngine engine.TaskEngine,
metricsChannel <-chan ecstcs.TelemetryMessage,
healthChannel <-chan ecstcs.HealthMessage,
doctor *doctor.Doctor) *DockerTelemetrySession {
doctor *doctor.Doctor) (*DockerTelemetrySession, error) {
ok, cfgParseErr := isContainerHealthMetricsDisabled(cfg)
if cfgParseErr != nil {
seelog.Warnf("Error starting metrics session: %v", cfgParseErr)
return nil
logger.Warn("Error starting metrics session", logger.Fields{
field.Error: cfgParseErr,
})
return nil, cfgParseErr
}
if ok {
seelog.Warnf("Metrics were disabled, not starting the telemetry session")
return nil
logger.Warn("Metrics were disabled, not starting the telemetry session")
return nil, nil
}

agentVersion, agentHash, containerRuntimeVersion := generateVersionInfo(taskEngine)
if cfg == nil {
Realmonia marked this conversation as resolved.
Show resolved Hide resolved
logger.Error("Config is empty in the tcs session parameter")
return nil
}

session := tcshandler.NewTelemetrySession(
containerInstanceArn,
Expand All @@ -90,27 +104,35 @@ func NewDockerTelemetrySession(
healthChannel,
doctor,
)
return &DockerTelemetrySession{session, ecsClient, containerInstanceArn}
return &DockerTelemetrySession{session, ecsClient, containerInstanceArn}, nil
}

// Start "overloads" tcshandler.TelemetrySession's Start with extra handling of discoverTelemetryEndpoint result.
// discoverTelemetryEndpoint and tcshandler.TelemetrySession's StartTelemetrySession errors are handled
// (retryWithBackoff or return) in a combined manner
func (session *DockerTelemetrySession) Start(ctx context.Context) error {
backoff := retry.NewExponentialBackoff(time.Second, 1*time.Minute, 0.2, 2)
backoff := retry.NewExponentialBackoff(backoffMin, backoffMax, jitterMultiple, multiple)
for {
select {
case <-ctx.Done():
logger.Info("ECS Telemetry service (TCS) session exited cleanly.")
return nil
default:
}
endpoint, tcsError := discoverPollEndpoint(session.containerInstanceArn, session.ecsClient)
if tcsError == nil {
// returning from StartTelemetrySession indicates a disconnection, need to reconnect.
tcsError = session.s.StartTelemetrySession(ctx, endpoint)
}
switch tcsError {
case context.Canceled, context.DeadlineExceeded:
return tcsError
case io.EOF, nil:
if tcsError == nil || tcsError == io.EOF {
Realmonia marked this conversation as resolved.
Show resolved Hide resolved
// reset backoff when TCS closed for a valid reason, such as connection expiring due to inactivity
logger.Info("TCS Websocket connection closed for a valid reason")
backoff.Reset()
default:
seelog.Errorf("Error: lost websocket connection with ECS Telemetry service (TCS): %v", tcsError)
} else {
// backoff when there is unexpected error, such as invalid frame sent through connection.
logger.Error("Error: lost websocket connection with ECS Telemetry service (TCS)", logger.Fields{
field.Error: tcsError,
})
time.Sleep(backoff.Duration())
}
}
Expand Down
138 changes: 138 additions & 0 deletions agent/stats/reporter/reporter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package reporter

import (
"context"
"errors"
"testing"

"github.com/aws/amazon-ecs-agent/agent/config"
mock_engine "github.com/aws/amazon-ecs-agent/agent/engine/mocks"
"github.com/aws/amazon-ecs-agent/agent/version"
"github.com/aws/amazon-ecs-agent/ecs-agent/doctor"
"github.com/aws/amazon-ecs-agent/ecs-agent/eventstream"
"github.com/aws/aws-sdk-go/aws/credentials"
"github.com/golang/mock/gomock"
"github.com/stretchr/testify/assert"
)

// Fixture values shared by the tests in this file. They are placeholder
// strings used to populate the config, the doctor, and the mocked task
// engine's reported version.
const (
testContainerInstanceArn = "testContainerInstanceArn"
testCluster = "testCluster"
testRegion = "us-west-2"
testDockerEndpoint = "testDockerEndpoint"
testDockerVersion = "testDockerVersion"
)

// TestNewDockerTelemetrySession exercises NewDockerTelemetrySession with a
// valid config, a nil config, and a config that disables metrics, and checks
// for each case whether a session and/or an error is returned.
func TestNewDockerTelemetrySession(t *testing.T) {
	emptyDoctor, err := doctor.NewDoctor([]doctor.Healthcheck{}, testCluster, testContainerInstanceArn)
	assert.NoError(t, err)
	testCredentials := credentials.NewStaticCredentials("test-id", "test-secret", "test-token")

	testCases := []struct {
		name            string
		cfg             *config.Config
		expectedSession bool
		expectedError   bool
	}{
		{
			name: "happy case",
			cfg: &config.Config{
				DisableMetrics:           config.BooleanDefaultFalse{},
				DisableDockerHealthCheck: config.BooleanDefaultFalse{},
				Cluster:                  testCluster,
				AWSRegion:                testRegion,
				AcceptInsecureCert:       false,
				DockerEndpoint:           testDockerEndpoint,
			},
			expectedSession: true,
			expectedError:   false,
		},
		{
			name:            "cfg parsing error",
			cfg:             nil,
			expectedSession: false,
			expectedError:   true,
		},
		{
			name: "metrics disabled",
			cfg: &config.Config{
				DisableMetrics: config.BooleanDefaultFalse{
					Value: config.ExplicitlyEnabled,
				},
				DisableDockerHealthCheck: config.BooleanDefaultFalse{
					Value: config.ExplicitlyEnabled,
				},
				Cluster:            testCluster,
				AWSRegion:          testRegion,
				AcceptInsecureCert: false,
				DockerEndpoint:     testDockerEndpoint,
			},
			expectedSession: false,
			expectedError:   false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Each subtest owns its controller so mock failures are reported
			// against the subtest's own *testing.T rather than the parent's.
			ctrl := gomock.NewController(t)
			defer ctrl.Finish()
			mockEngine := mock_engine.NewMockTaskEngine(ctrl)
			if tc.expectedSession {
				// The engine version is only queried when a session is
				// actually built; the error and disabled paths return first.
				mockEngine.EXPECT().Version().Return(testDockerVersion, nil)
			}

			dockerTelemetrySession, err := NewDockerTelemetrySession(
				testContainerInstanceArn,
				testCredentials,
				tc.cfg,
				eventstream.NewEventStream("Deregister_Instance", context.Background()),
				nil,
				mockEngine,
				nil,
				nil,
				emptyDoctor,
			)

			if tc.expectedSession {
				assert.NotNil(t, dockerTelemetrySession)
			} else {
				assert.Nil(t, dockerTelemetrySession)
			}

			if tc.expectedError {
				assert.Error(t, err)
			} else {
				assert.NoError(t, err)
			}
		})
	}
}

// TestGenerateVersionInfo_GetVersionError checks that when the task engine
// fails to report its version, generateVersionInfo still returns the agent
// version and git hash, with an empty container runtime version.
func TestGenerateVersionInfo_GetVersionError(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	taskEngine := mock_engine.NewMockTaskEngine(mockCtrl)
	taskEngine.EXPECT().Version().Return("", errors.New("error")).Times(1)

	gotVersion, gotHash, gotRuntimeVersion := generateVersionInfo(taskEngine)

	assert.Equal(t, version.Version, gotVersion)
	assert.Equal(t, version.GitShortHash, gotHash)
	assert.Equal(t, "", gotRuntimeVersion)
}

// TestGenerateVersionInfo_NoError checks that generateVersionInfo returns the
// agent version, the git hash, and the version string reported by the task
// engine when the engine's Version call succeeds.
func TestGenerateVersionInfo_NoError(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	taskEngine := mock_engine.NewMockTaskEngine(mockCtrl)
	taskEngine.EXPECT().Version().Return(testDockerVersion, nil).Times(1)

	gotVersion, gotHash, gotRuntimeVersion := generateVersionInfo(taskEngine)

	assert.Equal(t, version.Version, gotVersion)
	assert.Equal(t, version.GitShortHash, gotHash)
	assert.Equal(t, testDockerVersion, gotRuntimeVersion)
}
Loading