virtcontainers: reimplement sandbox cgroup
All containers run in different cgroups, even the sandbox. With this new implementation, the sandbox CPU cgroup will be equal to the sum of its containers' constraints, and the hypervisor process will be placed there, so the constraint applies to the containers running in the sandbox (VM). The default number of vcpus is used when the sandbox has no constraints. For example, if default_vcpus is 2, then the quota will be 200000 and the period 100000.

**c-ray test** http://www.futuretech.blinkenlights.nl/c-ray.html

```
+=========+==================+================+
|         | 6 threads 6 cpus | 1 thread 1 cpu |
+=========+==================+================+
| current | 40 seconds       | 122 seconds    |
+=========+==================+================+
| new     | 37 seconds       | 124 seconds    |
+=========+==================+================+
```

current = current cgroups implementation
new = new cgroups implementation

**workload**

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: c-ray
  annotations:
    io.kubernetes.cri.untrusted-workload: "true"
spec:
  restartPolicy: Never
  containers:
  - name: c-ray-1
    image: docker.io/devimc/c-ray:latest
    imagePullPolicy: IfNotPresent
    args: ["-t", "6", "-s", "1600x1200", "-r", "8", "-i",
           "/c-ray-1.1/sphfract", "-o", "/tmp/output.ppm"]
    resources:
      limits:
        cpu: 6
  - name: c-ray-2
    image: docker.io/devimc/c-ray:latest
    imagePullPolicy: IfNotPresent
    args: ["-t", "1", "-s", "1600x1200", "-r", "8", "-i",
           "/c-ray-1.1/sphfract", "-o", "/tmp/output.ppm"]
    resources:
      limits:
        cpu: 1
```

fixes #1125

Signed-off-by: Julio Montes <[email protected]>
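For illustration, a minimal sketch of the aggregation rule described above, using the CPU limits of the two c-ray containers from the workload (cpu: 6 and cpu: 1, expressed as CFS quota/period pairs). The variable names and concrete values here are illustrative, not part of the patch:

```go
package main

import "fmt"

func main() {
	// CFS period shared by all containers (microseconds).
	period := int64(100000)

	// Quotas derived from the workload above: cpu: 6 -> 600000, cpu: 1 -> 100000.
	containerQuotas := []int64{600000, 100000}

	// The sandbox cgroup quota is the sum of its containers' quotas,
	// so the hypervisor process placed there is bounded by the aggregate.
	var sandboxQuota int64
	for _, q := range containerQuotas {
		sandboxQuota += q
	}
	fmt.Printf("quota=%d period=%d\n", sandboxQuota, period) // quota=700000 period=100000

	// With no container constraints, the default number of vcpus is used:
	// default_vcpus = 2 -> quota = 2 * 100000 = 200000.
	defaultVCPUs := int64(2)
	fmt.Printf("default: quota=%d period=%d\n", defaultVCPUs*period, period)
}
```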
Julio Montes committed on Jan 28, 2019
1 parent 860dc10 · commit 2da5814
Showing 3 changed files with 218 additions and 148 deletions.
```diff
@@ -1,197 +1,251 @@
 // Copyright (c) 2018 Huawei Corporation
+// Copyright (c) 2019 Intel Corporation
 //
 // SPDX-License-Identifier: Apache-2.0
 //
 
 package virtcontainers
 
 import (
-	"encoding/json"
+	"bufio"
 	"fmt"
+	"math"
+	"os"
+	"path/filepath"
 	"strings"
 
 	"github.com/containerd/cgroups"
-	"github.com/kata-containers/runtime/virtcontainers/pkg/annotations"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 )
 
-const (
-	vcpuGroupName       = "vcpu"
-	defaultCgroupParent = "/kata"
-)
-
-type sandboxCgroups struct {
-	commonParent cgroups.Cgroup
-	sandboxSub   cgroups.Cgroup
-	vcpuSub      cgroups.Cgroup
+type cgroupPather interface {
+	cgroups.Subsystem
+	Path(path string) string
 }
 
-func (s *Sandbox) newCgroups() error {
-	// New will still succeed when cgroup exists
-	// create common parent for all kata-containers
-	// e.g. /sys/fs/cgroup/cpu/vc
-	parent, err := cgroups.New(cgroups.V1,
-		cgroups.StaticPath(defaultCgroupParent), &specs.LinuxResources{})
-	if err != nil {
-		return fmt.Errorf("failed to create cgroup for %q", defaultCgroupParent)
-	}
+// unconstrained cgroups are placed here.
+// for example /sys/fs/cgroup/memory/kata/$CGPATH
+// where path is defined by the containers manager
+const cgroupKataPath = "/kata/"
 
-	// create sub-cgroup for each sandbox
-	// e.g. /sys/fs/cgroup/cpu/vc/<sandbox>
-	sandboxSub, err := parent.New(s.id, &specs.LinuxResources{})
+// V1Constraints returns the cgroups that are compatible with th VC architecture
+// and hypervisor, constraints can be applied to these cgroups.
+func V1Constraints() ([]cgroups.Subsystem, error) {
+	root, err := cgroupV1MountPoint()
 	if err != nil {
-		return fmt.Errorf("failed to create cgroup for %s/%s", defaultCgroupParent, s.id)
+		return nil, err
 	}
+	subsystems := []cgroups.Subsystem{
+		cgroups.NewCputset(root),
+		cgroups.NewCpu(root),
+		cgroups.NewCpuacct(root),
+	}
+	return cgroupsSubsystems(subsystems)
+}
 
-	// create sub-cgroup for vcpu threads
-	vcpuSub, err := sandboxSub.New(vcpuGroupName, &specs.LinuxResources{})
+// V1NoConstraints returns the cgroups that are *not* compatible with th VC
+// architecture and hypervisor, constraints MUST NOT be applied to these cgroups.
+func V1NoConstraints() ([]cgroups.Subsystem, error) {
+	root, err := cgroupV1MountPoint()
 	if err != nil {
-		return fmt.Errorf("failed to create cgroup for %s/%s/%s", defaultCgroupParent, s.id, vcpuGroupName)
+		return nil, err
 	}
+	subsystems := []cgroups.Subsystem{
+		// Some constainers managers, like k8s, take the control of cgroups.
+		// k8s: the memory cgroup for the dns containers is small to place
+		// a hypervisor there.
+		cgroups.NewMemory(root),
+	}
+	return cgroupsSubsystems(subsystems)
+}
 
-	s.cgroup = &sandboxCgroups{
-		commonParent: parent,
-		sandboxSub:   sandboxSub,
-		vcpuSub:      vcpuSub,
+func cgroupsSubsystems(subsystems []cgroups.Subsystem) ([]cgroups.Subsystem, error) {
+	var enabled []cgroups.Subsystem
+	for _, s := range cgroupPathers(subsystems) {
+		// check and remove the default groups that do not exist
+		if _, err := os.Lstat(s.Path("/")); err == nil {
+			enabled = append(enabled, s)
+		}
 	}
-	return nil
+	return enabled, nil
 }
 
-func (s *Sandbox) destroyCgroups() error {
-	if s.cgroup == nil {
-		s.Logger().Warningf("cgroup is not initialized, no need to destroy")
-		return nil
+func cgroupPathers(subystems []cgroups.Subsystem) []cgroupPather {
+	var out []cgroupPather
+	for _, s := range subystems {
+		if p, ok := s.(cgroupPather); ok {
+			out = append(out, p)
+		}
 	}
+	return out
+}
 
-	// first move all processes in subgroup to parent in case live process blocks
-	// deletion of cgroup
-	if err := s.cgroup.sandboxSub.MoveTo(s.cgroup.commonParent); err != nil {
-		return fmt.Errorf("failed to clear cgroup processes")
+// v1MountPoint returns the mount point where the cgroup
+// mountpoints are mounted in a single hiearchy
+func cgroupV1MountPoint() (string, error) {
+	f, err := os.Open("/proc/self/mountinfo")
+	if err != nil {
+		return "", err
 	}
+	defer f.Close()
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		if err := scanner.Err(); err != nil {
+			return "", err
+		}
+		var (
+			text   = scanner.Text()
+			fields = strings.Split(text, " ")
+			// safe as mountinfo encodes mountpoints with spaces as \040.
+			index               = strings.Index(text, " - ")
+			postSeparatorFields = strings.Fields(text[index+3:])
+			numPostFields       = len(postSeparatorFields)
+		)
+		// this is an error as we can't detect if the mount is for "cgroup"
+		if numPostFields == 0 {
+			return "", fmt.Errorf("Found no fields post '-' in %q", text)
+		}
+		if postSeparatorFields[0] == "cgroup" {
+			// check that the mount is properly formated.
+			if numPostFields < 3 {
+				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+			}
+			return filepath.Dir(fields[4]), nil
+		}
+	}
+	return "", cgroups.ErrMountPointNotExist
+}
 
-	return s.cgroup.sandboxSub.Delete()
+func cgroupNoConstraintsPath(path string) string {
+	return fmt.Sprintf("%s/%s", cgroupKataPath, path)
 }
 
-func (s *Sandbox) setupCgroups() error {
-	if s.cgroup == nil {
-		return fmt.Errorf("failed to setup uninitialized cgroup for sandbox")
+func (s *Sandbox) updateCgroups() error {
+	if s.state.CgroupPath == "" {
+		return fmt.Errorf("Cgroup path is empty")
 	}
 
-	resource, err := s.mergeSpecResource()
+	cgroup, err := cgroups.Load(V1Constraints, cgroups.StaticPath(s.state.CgroupPath))
+	if err != nil {
+		return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err)
+	}
+
+	if err := s.constrainHypervisor(cgroup); err != nil {
+		return err
+	}
+
+	if len(s.containers) <= 1 {
+		// nothing to update
+		return nil
+	}
+
+	resources, err := s.resources()
 	if err != nil {
 		return err
 	}
 
-	if err := s.applyCPUCgroup(resource); err != nil {
-		return err
+	if err := cgroup.Update(&resources); err != nil {
+		return fmt.Errorf("Could not update cgroup %v: %v", s.state.CgroupPath, err)
 	}
+
 	return nil
 }
 
-func (s *Sandbox) applyCPUCgroup(rc *specs.LinuxResources) error {
-	if s.cgroup == nil {
-		return fmt.Errorf("failed to setup uninitialized cgroup for sandbox")
+func (s *Sandbox) deleteCgroups() error {
+	path := cgroupNoConstraintsPath(s.state.CgroupPath)
+	noConstraintsCgroup, err := cgroups.Load(V1NoConstraints, cgroups.StaticPath(path))
+	if err != nil {
+		return fmt.Errorf("Could not load cgroup %v: %v", path, err)
 	}
 
-	// apply cpu constraint to vcpu cgroup
-	if err := s.cgroup.vcpuSub.Update(rc); err != nil {
-		return err
+	return noConstraintsCgroup.Delete()
+}
+
+func (s *Sandbox) constrainHypervisor(cgroup cgroups.Cgroup) error {
+	pid := s.hypervisor.pid()
+	if pid <= 0 {
+		return fmt.Errorf("Invalid hypervisor PID: %d", pid)
 	}
 
-	// when new container joins, new CPU could be hotplugged, so we
-	// have to query fresh vcpu info from hypervisor for every time.
-	tids, err := s.hypervisor.getThreadIDs()
+	resources := &specs.LinuxResources{}
+	path := cgroupNoConstraintsPath(s.state.CgroupPath)
+	noConstraintsCgroup, err := cgroups.New(V1NoConstraints, cgroups.StaticPath(path), resources)
 	if err != nil {
-		return fmt.Errorf("failed to get thread ids from hypervisor: %v", err)
-	}
-	if tids == nil {
-		// If there's no tid returned from the hypervisor, this is not
-		// a bug. It simply means there is nothing to constrain, hence
-		// let's return without any error from here.
-		return nil
+		return fmt.Errorf("Could not create cgroup %v: %v", path, err)
 	}
 
-	// use Add() to add vcpu thread to s.cgroup, it will write thread id to
-	// `cgroup.procs` which will move all threads in qemu process to this cgroup
-	// immediately as default behaviour.
-	if len(tids.vcpus) > 0 {
-		if err := s.cgroup.sandboxSub.Add(cgroups.Process{
-			Pid: tids.vcpus[0],
-		}); err != nil {
-			return err
-		}
+	if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil {
+		return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err)
 	}
 
-	for _, i := range tids.vcpus {
-		if i <= 0 {
-			continue
-		}
+	// Add qemu into cgroup
+	return cgroup.Add(cgroups.Process{Pid: pid})
+}
 
-		// In contrast, AddTask will write thread id to `tasks`
-		// After this, vcpu threads are in "vcpu" sub-cgroup, other threads in
-		// qemu will be left in parent cgroup untouched.
-		if err := s.cgroup.vcpuSub.AddTask(cgroups.Process{
-			Pid: i,
-		}); err != nil {
-			return err
-		}
+func (s *Sandbox) resources() (specs.LinuxResources, error) {
+	resources := specs.LinuxResources{
+		CPU: s.cpuResources(),
 	}
 
-	return nil
+	return resources, nil
 }
 
-func (s *Sandbox) mergeSpecResource() (*specs.LinuxResources, error) {
-	if s.config == nil {
-		return nil, fmt.Errorf("sandbox config is nil")
+func (s *Sandbox) cpuResources() *specs.LinuxCPU {
+	quota := int64(0)
+	period := uint64(0)
+	shares := uint64(0)
+	cpus := ""
+
+	for _, c := range s.containers {
+		if s.state.Pid != c.process.Pid {
+			if c.state.Resources.CPU == nil {
+				continue
+			}
+
+			if c.state.Resources.CPU.Shares != nil {
+				shares = uint64(math.Max(float64(*c.state.Resources.CPU.Shares), float64(shares)))
+			}
+
+			if c.state.Resources.CPU.Quota != nil {
+				quota += *c.state.Resources.CPU.Quota
+			}
+
+			if c.state.Resources.CPU.Period != nil {
+				period = uint64(math.Max(float64(*c.state.Resources.CPU.Period), float64(period)))
+			}
+
+			if c.state.Resources.CPU.Cpus != "" {
+				cpus += c.state.Resources.CPU.Cpus + ","
+			}
+		}
 	}
+	cpus = strings.Trim(cpus, " \n\t,")
 
-	resource := &specs.LinuxResources{
-		CPU: &specs.LinuxCPU{},
+	cpu := &specs.LinuxCPU{}
+	if quota != int64(0) {
+		cpu.Quota = &quota
 	}
 
-	for _, c := range s.config.Containers {
-		config, ok := c.Annotations[annotations.ConfigJSONKey]
-		if !ok {
-			s.Logger().WithField("container", c.ID).Warningf("failed to find config from container annotations")
-			continue
-		}
+	if period != uint64(0) {
+		cpu.Period = &period
+	}
 
-		var spec specs.Spec
-		if err := json.Unmarshal([]byte(config), &spec); err != nil {
-			return nil, err
-		}
+	// use a default constraint for sandboxes without cpu constraints
+	if cpu.Period == nil && cpu.Quota == nil {
+		// set a quota and period equal to the default number of vcpus
+		quota = int64(s.config.HypervisorConfig.NumVCPUs) * 100000
+		period = 100000
+		cpu.Quota = &quota
+		cpu.Period = &period
+	}
 
-		// TODO: how to handle empty/unlimited resource?
-		// maybe we should add a default CPU/Memory delta when no
-		// resource limit is given. -- @WeiZhang555
-		if spec.Linux == nil || spec.Linux.Resources == nil {
-			continue
-		}
+	if shares != uint64(0) {
+		cpu.Shares = &shares
+	}
 
-		// calculate cpu quota and period
-		s.mergeCPUResource(resource, spec.Linux.Resources)
+	if cpus != "" {
+		cpu.Cpus = cpus
 	}
-	return resource, nil
+
+	return cpu
 }
-
-func (s *Sandbox) mergeCPUResource(orig, rc *specs.LinuxResources) {
-	if orig.CPU == nil {
-		orig.CPU = &specs.LinuxCPU{}
-	}
-
-	if rc.CPU != nil && rc.CPU.Quota != nil && rc.CPU.Period != nil &&
-		*rc.CPU.Quota > 0 && *rc.CPU.Period > 0 {
-		if orig.CPU.Period == nil {
-			orig.CPU.Period = rc.CPU.Period
-			orig.CPU.Quota = rc.CPU.Quota
-		} else {
-			// this is an example to show how it works:
-			// container A and `orig` has quota: 5000 and period 10000
-			// here comes container B with quota 40 and period 100,
-			// so use previous period 10000 as a baseline, container B
-			// has proportional resource of quota 4000 and period 10000, calculated as
-			// delta := 40 / 100 * 10000 = 4000
-			// and final `*orig.CPU.Quota` = 5000 + 4000 = 9000
-			delta := float64(*rc.CPU.Quota) / float64(*rc.CPU.Period) * float64(*orig.CPU.Period)
-			*orig.CPU.Quota += int64(delta)
-		}
-	}
-}
```
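As a quick sanity check of the resulting constraints, one could read the sandbox cgroup files on the host. A minimal sketch, assuming a cgroup v1 hierarchy; the path placeholder stands in for the real `s.state.CgroupPath`, which is set by the container manager and is not part of this patch:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// Hypothetical sandbox cgroup path; the real value comes from
	// s.state.CgroupPath and depends on the container manager.
	cg := "/sys/fs/cgroup/cpu/<sandbox-cgroup-path>"

	for _, name := range []string{"cpu.cfs_period_us", "cpu.cfs_quota_us", "cgroup.procs"} {
		data, err := os.ReadFile(filepath.Join(cg, name))
		if err != nil {
			fmt.Printf("%s: %v\n", name, err)
			continue
		}
		// cpu.cfs_quota_us should be the sum of the containers' quotas
		// (e.g. 700000 for the 6-cpu + 1-cpu workload above), and
		// cgroup.procs should include the hypervisor (qemu) PID.
		fmt.Printf("%s: %s", name, data)
	}
}
```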