virtcontainers: reimplement sandbox cgroup
All containers run in different cgroups, even the sandbox. With this new
implementation the sandbox CPU cgroup will be equal to the sum of all its
containers, and the hypervisor process will be placed there so that its
impact is limited to the containers running in the sandbox (VM). The
default number of vcpus is used when the sandbox has no constraints. For
example, if default_vcpus is 2, then the quota will be 200000 and the
period 100000.
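
A minimal sketch of the calculation described above (hypothetical
`sandboxCPUQuota` helper, not part of this commit; the actual logic is in
`cpuResources()` in virtcontainers/cgroups.go below):

```go
// Hypothetical helper illustrating the sandbox CPU cgroup calculation:
// the sandbox quota is the sum of its containers' quotas, and an
// unconstrained sandbox falls back to the default number of vcpus.
func sandboxCPUQuota(containerQuotas []int64, defaultVCPUs uint32) (quota int64, period uint64) {
	period = 100000 // CFS period in microseconds
	for _, q := range containerQuotas {
		quota += q
	}
	if quota == 0 {
		// e.g. default_vcpus = 2 -> quota = 200000, period = 100000
		quota = int64(defaultVCPUs) * 100000
	}
	return quota, period
}
```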

**c-ray test**
http://www.futuretech.blinkenlights.nl/c-ray.html

```
+=========+=================+=================+
|         | 6 threads 6cpus | 1 thread 1 cpu  |
+=========+=================+=================+
| current |   40 seconds    |   122 seconds   |
+---------+-----------------+-----------------+
|   new   |   37 seconds    |   124 seconds   |
+=========+=================+=================+
```

current = current cgroups implementation
new = new cgroups implementation

**workload**

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: c-ray
  annotations:
    io.kubernetes.cri.untrusted-workload: "true"
spec:
  restartPolicy: Never
  containers:
  - name: c-ray-1
    image: docker.io/devimc/c-ray:latest
    imagePullPolicy: IfNotPresent
    args: ["-t", "6", "-s", "1600x1200", "-r", "8", "-i",
          "/c-ray-1.1/sphfract", "-o", "/tmp/output.ppm"]
    resources:
      limits:
        cpu: 6
  - name: c-ray-2
    image: docker.io/devimc/c-ray:latest
    imagePullPolicy: IfNotPresent
    args: ["-t", "1", "-s", "1600x1200", "-r", "8", "-i",
          "/c-ray-1.1/sphfract", "-o", "/tmp/output.ppm"]
    resources:
      limits:
        cpu: 1
```

fixes #1125

Signed-off-by: Julio Montes <[email protected]>
Julio Montes committed Jan 28, 2019
1 parent 860dc10 commit 2da5814
Showing 3 changed files with 218 additions and 148 deletions.
5 changes: 0 additions & 5 deletions virtcontainers/api.go
@@ -120,11 +120,6 @@ func createSandboxFromConfig(ctx context.Context, sandboxConfig SandboxConfig, f
return nil, err
}

// Setup host cgroups
if err = s.setupCgroups(); err != nil {
return nil, err
}

return s, nil
}

314 changes: 184 additions & 130 deletions virtcontainers/cgroups.go
@@ -1,197 +1,251 @@
// Copyright (c) 2018 Huawei Corporation
// Copyright (c) 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

package virtcontainers

import (
"encoding/json"
"bufio"
"fmt"
"math"
"os"
"path/filepath"
"strings"

"github.com/containerd/cgroups"
"github.com/kata-containers/runtime/virtcontainers/pkg/annotations"
specs "github.com/opencontainers/runtime-spec/specs-go"
)

const (
vcpuGroupName = "vcpu"
defaultCgroupParent = "/kata"
)

type sandboxCgroups struct {
commonParent cgroups.Cgroup
sandboxSub cgroups.Cgroup
vcpuSub cgroups.Cgroup
type cgroupPather interface {
cgroups.Subsystem
Path(path string) string
}

func (s *Sandbox) newCgroups() error {
// New will still succeed when cgroup exists
// create common parent for all kata-containers
// e.g. /sys/fs/cgroup/cpu/vc
parent, err := cgroups.New(cgroups.V1,
cgroups.StaticPath(defaultCgroupParent), &specs.LinuxResources{})
if err != nil {
return fmt.Errorf("failed to create cgroup for %q", defaultCgroupParent)
}
// unconstrained cgroups are placed here,
// for example /sys/fs/cgroup/memory/kata/$CGPATH
// where the path is defined by the container manager
const cgroupKataPath = "/kata/"

// create sub-cgroup for each sandbox
// e.g. /sys/fs/cgroup/cpu/vc/<sandbox>
sandboxSub, err := parent.New(s.id, &specs.LinuxResources{})
// V1Constraints returns the cgroups that are compatible with the VC architecture
// and hypervisor; constraints can be applied to these cgroups.
func V1Constraints() ([]cgroups.Subsystem, error) {
root, err := cgroupV1MountPoint()
if err != nil {
return fmt.Errorf("failed to create cgroup for %s/%s", defaultCgroupParent, s.id)
return nil, err
}
subsystems := []cgroups.Subsystem{
cgroups.NewCputset(root),
cgroups.NewCpu(root),
cgroups.NewCpuacct(root),
}
return cgroupsSubsystems(subsystems)
}

// create sub-cgroup for vcpu threads
vcpuSub, err := sandboxSub.New(vcpuGroupName, &specs.LinuxResources{})
// V1NoConstraints returns the cgroups that are *not* compatible with the VC
// architecture and hypervisor; constraints MUST NOT be applied to these cgroups.
func V1NoConstraints() ([]cgroups.Subsystem, error) {
root, err := cgroupV1MountPoint()
if err != nil {
return fmt.Errorf("failed to create cgroup for %s/%s/%s", defaultCgroupParent, s.id, vcpuGroupName)
return nil, err
}
subsystems := []cgroups.Subsystem{
// Some container managers, like k8s, take control of cgroups.
// k8s: the memory cgroup for the dns containers is too small to place
// a hypervisor there.
cgroups.NewMemory(root),
}
return cgroupsSubsystems(subsystems)
}

s.cgroup = &sandboxCgroups{
commonParent: parent,
sandboxSub: sandboxSub,
vcpuSub: vcpuSub,
func cgroupsSubsystems(subsystems []cgroups.Subsystem) ([]cgroups.Subsystem, error) {
var enabled []cgroups.Subsystem
for _, s := range cgroupPathers(subsystems) {
// check and remove the default groups that do not exist
if _, err := os.Lstat(s.Path("/")); err == nil {
enabled = append(enabled, s)
}
}
return nil
return enabled, nil
}

func (s *Sandbox) destroyCgroups() error {
if s.cgroup == nil {
s.Logger().Warningf("cgroup is not initialized, no need to destroy")
return nil
func cgroupPathers(subystems []cgroups.Subsystem) []cgroupPather {
var out []cgroupPather
for _, s := range subystems {
if p, ok := s.(cgroupPather); ok {
out = append(out, p)
}
}
return out
}

// first move all processes in subgroup to parent in case live process blocks
// deletion of cgroup
if err := s.cgroup.sandboxSub.MoveTo(s.cgroup.commonParent); err != nil {
return fmt.Errorf("failed to clear cgroup processes")
// cgroupV1MountPoint returns the mount point where the cgroup
// mountpoints are mounted in a single hierarchy
func cgroupV1MountPoint() (string, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
if err := scanner.Err(); err != nil {
return "", err
}
var (
text = scanner.Text()
fields = strings.Split(text, " ")
// safe as mountinfo encodes mountpoints with spaces as \040.
index = strings.Index(text, " - ")
postSeparatorFields = strings.Fields(text[index+3:])
numPostFields = len(postSeparatorFields)
)
// this is an error as we can't detect if the mount is for "cgroup"
if numPostFields == 0 {
return "", fmt.Errorf("Found no fields post '-' in %q", text)
}
if postSeparatorFields[0] == "cgroup" {
// check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
return filepath.Dir(fields[4]), nil
}
}
return "", cgroups.ErrMountPointNotExist
}

return s.cgroup.sandboxSub.Delete()
func cgroupNoConstraintsPath(path string) string {
return fmt.Sprintf("%s/%s", cgroupKataPath, path)
}

func (s *Sandbox) setupCgroups() error {
if s.cgroup == nil {
return fmt.Errorf("failed to setup uninitialized cgroup for sandbox")
func (s *Sandbox) updateCgroups() error {
if s.state.CgroupPath == "" {
return fmt.Errorf("Cgroup path is empty")
}

resource, err := s.mergeSpecResource()
cgroup, err := cgroups.Load(V1Constraints, cgroups.StaticPath(s.state.CgroupPath))
if err != nil {
return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err)
}

if err := s.constrainHypervisor(cgroup); err != nil {
return err
}

if err := s.applyCPUCgroup(resource); err != nil {
if len(s.containers) <= 1 {
// nothing to update
return nil
}

resources, err := s.resources()
if err != nil {
return err
}

if err := cgroup.Update(&resources); err != nil {
return fmt.Errorf("Could not update cgroup %v: %v", s.state.CgroupPath, err)
}

return nil
}

func (s *Sandbox) applyCPUCgroup(rc *specs.LinuxResources) error {
if s.cgroup == nil {
return fmt.Errorf("failed to setup uninitialized cgroup for sandbox")
func (s *Sandbox) deleteCgroups() error {
path := cgroupNoConstraintsPath(s.state.CgroupPath)
noConstraintsCgroup, err := cgroups.Load(V1NoConstraints, cgroups.StaticPath(path))
if err != nil {
return fmt.Errorf("Could not load cgroup %v: %v", path, err)
}

// apply cpu constraint to vcpu cgroup
if err := s.cgroup.vcpuSub.Update(rc); err != nil {
return err
return noConstraintsCgroup.Delete()
}

func (s *Sandbox) constrainHypervisor(cgroup cgroups.Cgroup) error {
pid := s.hypervisor.pid()
if pid <= 0 {
return fmt.Errorf("Invalid hypervisor PID: %d", pid)
}

// when new container joins, new CPU could be hotplugged, so we
// have to query fresh vcpu info from hypervisor for every time.
tids, err := s.hypervisor.getThreadIDs()
resources := &specs.LinuxResources{}
path := cgroupNoConstraintsPath(s.state.CgroupPath)
noConstraintsCgroup, err := cgroups.New(V1NoConstraints, cgroups.StaticPath(path), resources)
if err != nil {
return fmt.Errorf("failed to get thread ids from hypervisor: %v", err)
}
if tids == nil {
// If there's no tid returned from the hypervisor, this is not
// a bug. It simply means there is nothing to constrain, hence
// let's return without any error from here.
return nil
return fmt.Errorf("Could not create cgroup %v: %v", path, err)
}

// use Add() to add vcpu thread to s.cgroup, it will write thread id to
// `cgroup.procs` which will move all threads in qemu process to this cgroup
// immediately as default behaviour.
if len(tids.vcpus) > 0 {
if err := s.cgroup.sandboxSub.Add(cgroups.Process{
Pid: tids.vcpus[0],
}); err != nil {
return err
}
if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil {
return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err)
}

for _, i := range tids.vcpus {
if i <= 0 {
continue
}
// Add qemu into cgroup
return cgroup.Add(cgroups.Process{Pid: pid})
}

// In contrast, AddTask will write thread id to `tasks`
// After this, vcpu threads are in "vcpu" sub-cgroup, other threads in
// qemu will be left in parent cgroup untouched.
if err := s.cgroup.vcpuSub.AddTask(cgroups.Process{
Pid: i,
}); err != nil {
return err
}
func (s *Sandbox) resources() (specs.LinuxResources, error) {
resources := specs.LinuxResources{
CPU: s.cpuResources(),
}

return nil
return resources, nil
}

func (s *Sandbox) mergeSpecResource() (*specs.LinuxResources, error) {
if s.config == nil {
return nil, fmt.Errorf("sandbox config is nil")
func (s *Sandbox) cpuResources() *specs.LinuxCPU {
quota := int64(0)
period := uint64(0)
shares := uint64(0)
cpus := ""

for _, c := range s.containers {
if s.state.Pid != c.process.Pid {
if c.state.Resources.CPU == nil {
continue
}

if c.state.Resources.CPU.Shares != nil {
shares = uint64(math.Max(float64(*c.state.Resources.CPU.Shares), float64(shares)))
}

if c.state.Resources.CPU.Quota != nil {
quota += *c.state.Resources.CPU.Quota
}

if c.state.Resources.CPU.Period != nil {
period = uint64(math.Max(float64(*c.state.Resources.CPU.Period), float64(period)))
}

if c.state.Resources.CPU.Cpus != "" {
cpus += c.state.Resources.CPU.Cpus + ","
}
}
}
cpus = strings.Trim(cpus, " \n\t,")

resource := &specs.LinuxResources{
CPU: &specs.LinuxCPU{},
cpu := &specs.LinuxCPU{}
if quota != int64(0) {
cpu.Quota = &quota
}

for _, c := range s.config.Containers {
config, ok := c.Annotations[annotations.ConfigJSONKey]
if !ok {
s.Logger().WithField("container", c.ID).Warningf("failed to find config from container annotations")
continue
}
if period != uint64(0) {
cpu.Period = &period
}

var spec specs.Spec
if err := json.Unmarshal([]byte(config), &spec); err != nil {
return nil, err
}
// use a default constraint for sandboxes without cpu constraints
if cpu.Period == nil && cpu.Quota == nil {
// set a quota and period equal to the default number of vcpus
quota = int64(s.config.HypervisorConfig.NumVCPUs) * 100000
period = 100000
cpu.Quota = &quota
cpu.Period = &period
}

// TODO: how to handle empty/unlimited resource?
// maybe we should add a default CPU/Memory delta when no
// resource limit is given. -- @WeiZhang555
if spec.Linux == nil || spec.Linux.Resources == nil {
continue
}
// calculate cpu quota and period
s.mergeCPUResource(resource, spec.Linux.Resources)
if shares != uint64(0) {
cpu.Shares = &shares
}
return resource, nil
}

func (s *Sandbox) mergeCPUResource(orig, rc *specs.LinuxResources) {
if orig.CPU == nil {
orig.CPU = &specs.LinuxCPU{}
}

if rc.CPU != nil && rc.CPU.Quota != nil && rc.CPU.Period != nil &&
*rc.CPU.Quota > 0 && *rc.CPU.Period > 0 {
if orig.CPU.Period == nil {
orig.CPU.Period = rc.CPU.Period
orig.CPU.Quota = rc.CPU.Quota
} else {
// this is an example to show how it works:
// container A and `orig` has quota: 5000 and period 10000
// here comes container B with quota 40 and period 100,
// so use previous period 10000 as a baseline, container B
// has proportional resource of quota 4000 and period 10000, calculated as
// delta := 40 / 100 * 10000 = 4000
// and final `*orig.CPU.Quota` = 5000 + 4000 = 9000
delta := float64(*rc.CPU.Quota) / float64(*rc.CPU.Period) * float64(*orig.CPU.Period)
*orig.CPU.Quota += int64(delta)
}
if cpus != "" {
cpu.Cpus = cpus
}

return cpu
}