Skip to content

Commit

Permalink
Make CGroups CPU period configurable
Browse files Browse the repository at this point in the history
Signed-off-by: Mohamad Arab <[email protected]>
  • Loading branch information
boynux authored and Mohamad Arab committed Oct 19, 2018
1 parent 3d36855 commit afb836c
Show file tree
Hide file tree
Showing 12 changed files with 94 additions and 19 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ additional details on each available environment variable.
| `ECS_HOST_DATA_DIR` | `/var/lib/ecs` | The source directory on the host from which ECS_DATADIR is mounted. We use this to determine the source mount path for container metadata files in the case the ECS Agent is running as a container. We do not use this value in Windows because the ECS Agent is not running as container in Windows. | `/var/lib/ecs` | `Not used` |
| `ECS_ENABLE_TASK_CPU_MEM_LIMIT` | `true` | Whether to enable task-level cpu and memory limits | `true` | `false` |
| `ECS_CGROUP_PATH` | `/sys/fs/cgroup` | The root cgroup path that is expected by the ECS agent. This is the path that accessible from the agent mount. | `/sys/fs/cgroup` | Not applicable |
| `ECS_CGROUP_CPU_PERIOD` | `10ms` | CGroups CPU period for task level limits. | `100ms` | Not applicable |
| `ECS_ENABLE_CPU_UNBOUNDED_WINDOWS_WORKAROUND` | `true` | When `true`, ECS will allow CPU unbounded(CPU=`0`) tasks to run along with CPU bounded tasks in Windows. | Not applicable | `false` |
| `ECS_TASK_METADATA_RPS_LIMIT` | `100,150` | Comma separated integer values for steady state and burst throttle limits for task metadata endpoint | `40,60` | `40,60` |
| `ECS_SHARED_VOLUME_MATCH_FULL_CONFIG` | `true` | When `true`, ECS Agent will compare name, driver options, and labels to make sure volumes are identical. When `false`, Agent will short circuit shared volume comparison if the names match. This is the default Docker behavior. If a volume is shared across instances, this should be set to `false`. | `false` | `false`|
Expand Down
2 changes: 1 addition & 1 deletion agent/api/task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ func (task *Task) PostUnmarshalTask(cfg *config.Config,
// hook into this
task.adjustForPlatform(cfg)
if task.MemoryCPULimitsEnabled {
err := task.initializeCgroupResourceSpec(cfg.CgroupPath, resourceFields)
err := task.initializeCgroupResourceSpec(cfg.CgroupPath, cfg.CGroupCPUPeriod, resourceFields)
if err != nil {
seelog.Errorf("Task [%s]: could not intialize resource: %v", task.Arn, err)
return apierrors.NewResourceInitError(task.Arn, err)
Expand Down
13 changes: 6 additions & 7 deletions agent/api/task/task_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ const (
//and is maintained here for unix default. Also used for testing
memorySwappinessDefault = 0

defaultCPUPeriod = 100 * time.Millisecond // 100ms
// With a 100ms CPU period, we can express 0.01 vCPU to 10 vCPUs
maxTaskVCPULimit = 10
// Reference: http://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
Expand All @@ -55,12 +54,12 @@ func (task *Task) adjustForPlatform(cfg *config.Config) {
task.MemoryCPULimitsEnabled = cfg.TaskCPUMemLimit.Enabled()
}

func (task *Task) initializeCgroupResourceSpec(cgroupPath string, resourceFields *taskresource.ResourceFields) error {
func (task *Task) initializeCgroupResourceSpec(cgroupPath string, cGroupCPUPeriod time.Duration, resourceFields *taskresource.ResourceFields) error {
cgroupRoot, err := task.BuildCgroupRoot()
if err != nil {
return errors.Wrapf(err, "cgroup resource: unable to determine cgroup root for task")
}
resSpec, err := task.BuildLinuxResourceSpec()
resSpec, err := task.BuildLinuxResourceSpec(cGroupCPUPeriod)
if err != nil {
return errors.Wrapf(err, "cgroup resource: unable to build resource spec for task")
}
Expand Down Expand Up @@ -89,13 +88,13 @@ func (task *Task) BuildCgroupRoot() (string, error) {
}

// BuildLinuxResourceSpec returns a linuxResources object for the task cgroup
func (task *Task) BuildLinuxResourceSpec() (specs.LinuxResources, error) {
func (task *Task) BuildLinuxResourceSpec(cGroupCPUPeriod time.Duration) (specs.LinuxResources, error) {
linuxResourceSpec := specs.LinuxResources{}

// If task level CPU limits are requested, set CPU quota + CPU period
// Else set CPU shares
if task.CPU > 0 {
linuxCPUSpec, err := task.buildExplicitLinuxCPUSpec()
linuxCPUSpec, err := task.buildExplicitLinuxCPUSpec(cGroupCPUPeriod)
if err != nil {
return specs.LinuxResources{}, err
}
Expand All @@ -120,13 +119,13 @@ func (task *Task) BuildLinuxResourceSpec() (specs.LinuxResources, error) {

// buildExplicitLinuxCPUSpec builds CPU spec when task CPU limits are
// explicitly requested
func (task *Task) buildExplicitLinuxCPUSpec() (specs.LinuxCPU, error) {
func (task *Task) buildExplicitLinuxCPUSpec(cGroupCPUPeriod time.Duration) (specs.LinuxCPU, error) {
if task.CPU > maxTaskVCPULimit {
return specs.LinuxCPU{},
errors.Errorf("task CPU spec builder: unsupported CPU limits, requested=%f, max-supported=%d",
task.CPU, maxTaskVCPULimit)
}
taskCPUPeriod := uint64(defaultCPUPeriod / time.Microsecond)
taskCPUPeriod := uint64(cGroupCPUPeriod / time.Microsecond)
taskCPUQuota := int64(task.CPU * float64(taskCPUPeriod))

// TODO: DefaultCPUPeriod only permits 10VCPUs.
Expand Down
19 changes: 11 additions & 8 deletions agent/api/task/task_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ func TestBuildCgroupRootErrorPath(t *testing.T) {
// TestBuildLinuxResourceSpecCPUMem validates the linux resource spec builder
func TestBuildLinuxResourceSpecCPUMem(t *testing.T) {
taskMemoryLimit := int64(taskMemoryLimit)
defaultCPUPeriod := 100 * time.Millisecond

task := &Task{
Arn: validTaskArn,
Expand All @@ -131,14 +132,16 @@ func TestBuildLinuxResourceSpecCPUMem(t *testing.T) {
},
}

linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(defaultCPUPeriod)

assert.NoError(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
}

// TestBuildLinuxResourceSpecCPU validates the linux resource spec builder
func TestBuildLinuxResourceSpecCPU(t *testing.T) {
defaultCPUPeriod := 100 * time.Millisecond

task := &Task{
Arn: validTaskArn,
CPU: float64(taskVCPULimit),
Expand All @@ -153,7 +156,7 @@ func TestBuildLinuxResourceSpecCPU(t *testing.T) {
},
}

linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(defaultCPUPeriod)

assert.NoError(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand All @@ -176,7 +179,7 @@ func TestBuildLinuxResourceSpecWithoutTaskCPULimits(t *testing.T) {
},
}

linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(100 * time.Millisecond)

assert.NoError(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand All @@ -200,7 +203,7 @@ func TestBuildLinuxResourceSpecWithoutTaskCPUWithContainerCPULimits(t *testing.T
},
}

linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(100 * time.Millisecond)

assert.NoError(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand All @@ -223,7 +226,7 @@ func TestBuildLinuxResourceSpecInvalidMem(t *testing.T) {
}

expectedLinuxResourceSpec := specs.LinuxResources{}
linuxResourceSpec, err := task.BuildLinuxResourceSpec()
linuxResourceSpec, err := task.BuildLinuxResourceSpec(100 * time.Millisecond)

assert.Error(t, err)
assert.EqualValues(t, expectedLinuxResourceSpec, linuxResourceSpec)
Expand Down Expand Up @@ -403,7 +406,7 @@ func TestInitCgroupResourceSpecHappyPath(t *testing.T) {
defer ctrl.Finish()
mockControl := mock_control.NewMockControl(ctrl)
mockIO := mock_ioutilwrapper.NewMockIOUtil(ctrl)
assert.NoError(t, task.initializeCgroupResourceSpec("cgroupPath", &taskresource.ResourceFields{
assert.NoError(t, task.initializeCgroupResourceSpec("cgroupPath", 100*time.Millisecond, &taskresource.ResourceFields{
Control: mockControl,
ResourceFieldsCommon: &taskresource.ResourceFieldsCommon{
IOUtil: mockIO,
Expand All @@ -427,7 +430,7 @@ func TestInitCgroupResourceSpecInvalidARN(t *testing.T) {
MemoryCPULimitsEnabled: true,
ResourcesMapUnsafe: make(map[string][]taskresource.TaskResource),
}
assert.Error(t, task.initializeCgroupResourceSpec("", nil))
assert.Error(t, task.initializeCgroupResourceSpec("", time.Millisecond, nil))
assert.Equal(t, 0, len(task.GetResources()))
assert.Equal(t, 0, len(task.Containers[0].TransitionDependenciesMap))
}
Expand All @@ -448,7 +451,7 @@ func TestInitCgroupResourceSpecInvalidMem(t *testing.T) {
MemoryCPULimitsEnabled: true,
ResourcesMapUnsafe: make(map[string][]taskresource.TaskResource),
}
assert.Error(t, task.initializeCgroupResourceSpec("", nil))
assert.Error(t, task.initializeCgroupResourceSpec("", time.Millisecond, nil))
assert.Equal(t, 0, len(task.GetResources()))
assert.Equal(t, 0, len(task.Containers[0].TransitionDependenciesMap))
}
Expand Down
3 changes: 1 addition & 2 deletions agent/api/task/task_unsupported.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ const (
//and is maintained here for unix default. Also used for testing
memorySwappinessDefault = 0

defaultCPUPeriod = 100 * time.Millisecond // 100ms
// With a 100ms CPU period, we can express 0.01 vCPU to 10 vCPUs
maxTaskVCPULimit = 10
// Reference: http://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
Expand All @@ -50,7 +49,7 @@ func (task *Task) adjustForPlatform(cfg *config.Config) {

func getCanonicalPath(path string) string { return path }

func (task *Task) initializeCgroupResourceSpec(cgroupPath string, resourceFields *taskresource.ResourceFields) error {
func (task *Task) initializeCgroupResourceSpec(cgroupPath string, cGroupCPUPeriod time.Duration, resourceFields *taskresource.ResourceFields) error {
return nil
}

Expand Down
3 changes: 2 additions & 1 deletion agent/api/task/task_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"path/filepath"
"runtime"
"strings"
"time"

"github.com/aws/amazon-ecs-agent/agent/config"
"github.com/aws/amazon-ecs-agent/agent/taskresource"
Expand Down Expand Up @@ -133,6 +134,6 @@ func (task *Task) dockerCPUShares(containerCPU uint) int64 {
return int64(containerCPU)
}

func (task *Task) initializeCgroupResourceSpec(cgroupPath string, resourceFields *taskresource.ResourceFields) error {
func (task *Task) initializeCgroupResourceSpec(cgroupPath string, cGroupCPUPeriod time.Duration, resourceFields *taskresource.ResourceFields) error {
return errors.New("unsupported platform")
}
6 changes: 6 additions & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ const (

// DefaultTaskMetadataBurstRate is set to handle 60 burst requests at once
DefaultTaskMetadataBurstRate = 60

// DefaultCPUPeriod is set to 100 ms to set CFS period and quota for task limits
defaultCGroupCPUPeriod = 100 * time.Millisecond
maximumCGroupCPUPeriod = 100 * time.Millisecond
minimumCGroupCPUPeriod = 1 * time.Millisecond
)

const (
Expand Down Expand Up @@ -467,6 +472,7 @@ func environmentConfig() (Config, error) {
TaskMetadataSteadyStateRate: steadyStateRate,
TaskMetadataBurstRate: burstRate,
SharedVolumeMatchFullConfig: utils.ParseBool(os.Getenv("ECS_SHARED_VOLUME_MATCH_FULL_CONFIG"), false),
CGroupCPUPeriod: parseCGroupCPUPeriod(),
}, err
}

Expand Down
3 changes: 3 additions & 0 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,12 @@ func TestEnvironmentConfig(t *testing.T) {
defer setTestEnv("ECS_ENABLE_TASK_ENI", "true")()
defer setTestEnv("ECS_TASK_METADATA_RPS_LIMIT", "1000,1100")()
defer setTestEnv("ECS_SHARED_VOLUME_MATCH_FULL_CONFIG", "true")()
defer setTestEnv("ECS_CGROUP_CPU_PERIOD", "")
additionalLocalRoutesJSON := `["1.2.3.4/22","5.6.7.8/32"]`
setTestEnv("ECS_AWSVPC_ADDITIONAL_LOCAL_ROUTES", additionalLocalRoutesJSON)
setTestEnv("ECS_ENABLE_CONTAINER_METADATA", "true")
setTestEnv("ECS_HOST_DATA_DIR", "/etc/ecs/")
setTestEnv("ECS_CGROUP_CPU_PERIOD", "10ms")

conf, err := environmentConfig()
assert.NoError(t, err)
Expand Down Expand Up @@ -130,6 +132,7 @@ func TestEnvironmentConfig(t *testing.T) {
assert.Equal(t, 1000, conf.TaskMetadataSteadyStateRate)
assert.Equal(t, 1100, conf.TaskMetadataBurstRate)
assert.True(t, conf.SharedVolumeMatchFullConfig, "Wrong value for SharedVolumeMatchFullConfig")
assert.Equal(t, 10*time.Millisecond, conf.CGroupCPUPeriod)
}

func TestTrimWhitespaceWhenCreating(t *testing.T) {
Expand Down
1 change: 1 addition & 0 deletions agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ func DefaultConfig() Config {
TaskMetadataSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
TaskMetadataBurstRate: DefaultTaskMetadataBurstRate,
SharedVolumeMatchFullConfig: false, // only requiring shared volumes to match on name, which is default docker behavior
CGroupCPUPeriod: defaultCGroupCPUPeriod,
}
}

Expand Down
38 changes: 38 additions & 0 deletions agent/config/config_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, DefaultTaskMetadataBurstRate, cfg.TaskMetadataBurstRate,
"Default TaskMetadataBurstRate is set incorrectly")
assert.False(t, cfg.SharedVolumeMatchFullConfig, "Default SharedVolumeMatchFullConfig set incorrectly")
assert.Equal(t, defaultCGroupCPUPeriod, cfg.CGroupCPUPeriod, "CFS cpu period set incorrectly")
}

// TestConfigFromFile tests the configuration can be read from file
Expand Down Expand Up @@ -192,3 +193,40 @@ func setupFileConfiguration(t *testing.T, configContent string) string {

return file.Name()
}

func TestOverrideDefaultCPUPeriod(t *testing.T) {
defer setTestRegion()()
defer os.Setenv("ECS_CGROUP_CPU_PERIOD", "100ms")

os.Setenv("ECS_CGROUP_CPU_PERIOD", "10ms")
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, 10*time.Millisecond, conf.CGroupCPUPeriod, "Wrong value for CGroupCPUPeriod")
}

func TestDefaultCPUPeriod(t *testing.T) {
defer setTestRegion()()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, defaultCGroupCPUPeriod, conf.CGroupCPUPeriod, "Wrong value for CGroupCPUPeriod")
}

func TestCPUPeriodUpperBoundLimit(t *testing.T) {
defer setTestRegion()()
defer os.Setenv("ECS_CGROUP_CPU_PERIOD", "100ms")

os.Setenv("ECS_CGROUP_CPU_PERIOD", "110ms")
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, defaultCGroupCPUPeriod, conf.CGroupCPUPeriod, "Wrong value for CGroupCPUPeriod")
}

func TestCPUPeriodLowerBoundLimit(t *testing.T) {
defer setTestRegion()()
defer os.Setenv("ECS_CGROUP_CPU_PERIOD", "100ms")

os.Setenv("ECS_CGROUP_CPU_PERIOD", "100us")
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, defaultCGroupCPUPeriod, conf.CGroupCPUPeriod, "Wrong value for CGroupCPUPeriod")
}
21 changes: 21 additions & 0 deletions agent/config/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,24 @@ func parseEnvVariableDuration(envVar string) time.Duration {
}
return duration
}

func parseCGroupCPUPeriod() time.Duration {
envVal := os.Getenv("ECS_CGROUP_CPU_PERIOD")
if envVal == "" {
seelog.Debugf("Environment variable empty: ECS_CGROUP_CPU_PERIOD")
} else {
var err error
duration, err := time.ParseDuration(envVal)
if err != nil {
seelog.Warnf("Could not parse duration value: %v for Environment Variable ECS_CGROUP_CPU_PERIOD is not correct: %v, using default value instead", envVal, err)
} else {
if duration >= minimumCGroupCPUPeriod && duration <= maximumCGroupCPUPeriod {
return duration
} else {
seelog.Warnf("CPU Period duration value: %v for Environment Variable ECS_CGROUP_CPU_PERIOD is not within [%v, %v], using default value instead", envVal, minimumCGroupCPUPeriod, maximumCGroupCPUPeriod)
}
}
}

return defaultCGroupCPUPeriod
}
3 changes: 3 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,4 +233,7 @@ type Config struct {
// which ECS agent tries to register the instance where the instance id document is
// not available or needed
NoIID bool

// CGroupCPUPeriod is config option to set different CFS quota and period values in microsecond, defaults to 100 ms
CGroupCPUPeriod time.Duration
}

0 comments on commit afb836c

Please sign in to comment.