
Implement workaround to clean up leaking cgroups #570

Merged
merged 4 commits into from Mar 13, 2020
2 changes: 2 additions & 0 deletions tool/planet/agent.go
@@ -314,6 +314,8 @@ func runAgent(conf *agent.Config, monitoringConf *monitoring.Config, leaderConf
}
}

go runSystemdCgroupCleaner(ctx)

signalc := make(chan os.Signal, 2)
signal.Notify(signalc, os.Interrupt, syscall.SIGTERM)

148 changes: 148 additions & 0 deletions tool/planet/cgroup.go
@@ -37,12 +37,16 @@ package main

import (
"bytes"
"context"
"fmt"
"io/ioutil"
"os"
"path"
"path/filepath"
"runtime"
"strings"
"text/template"
"time"

systemdDbus "github.com/coreos/go-systemd/dbus"
"github.com/davecgh/go-spew/spew"
@@ -320,3 +324,147 @@ func u64(n uint64) *uint64 {
func i64(n int64) *int64 {
return &n
}

// runSystemdCgroupCleaner implements a workaround for a kubernetes/systemd issue with cgroups/systemd scopes that leak
// under certain circumstances, usually when using a kubernetes cronjob with a mount. This appears to be mostly a
// systemd or kernel issue, where the pids running within the scope do not cause the scope to complete and clean up,
// resulting in leaked memory.
// https://github.com/kubernetes/kubernetes/issues/70324
// https://github.com/kubernetes/kubernetes/issues/64137
// https://github.com/gravitational/gravity/issues/1219
//
// Kubernetes uses systemd-run --scope when creating mounts within systemd.
// https://github.com/gravitational/kubernetes/blob/1c045a09db662c6660562d88deff2733ca17dcf8/pkg/util/mount/mount_linux.go#L108-L131
//
// To clean up this leak, we scan for run-xxx.scope cgroups that contain no processes and are at least a minute
// old, and have systemd stop those scopes.
func runSystemdCgroupCleaner(ctx context.Context) {
ticker := time.NewTicker(15 * time.Minute)
defer ticker.Stop()

for {
select {
case <-ticker.C:
err := cleanSystemdScopes()
if err != nil {
logrus.WithError(err).Warn("Failed to clean systemd scopes that don't contain processes")
}
case <-ctx.Done():
return
}
}
}
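For reference, the kubelet behavior the doc comment describes looks roughly like the sketch below. This is an approximation of the linked mount_linux.go, not a copy of it; the helper name and exact flag layout are illustrative (requires "os/exec" and "strings"):

// mountInTransientScope approximates how Kubernetes wraps a mount in a
// transient systemd scope (see the mount_linux.go link above). Every call
// creates a run-rXXXX.scope cgroup; when such a scope fails to collapse
// after its processes exit, it leaks, which is what the cleaner above
// works around.
func mountInTransientScope(source, target, fstype string, options []string) error {
	args := []string{
		"--description=Kubernetes transient mount for " + target,
		"--scope",
		"--",
		"mount",
	}
	if fstype != "" {
		args = append(args, "-t", fstype)
	}
	if len(options) > 0 {
		args = append(args, "-o", strings.Join(options, ","))
	}
	args = append(args, source, target)
	return exec.Command("systemd-run", args...).Run()
}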

func cleanSystemdScopes() error {
log := logrus.WithField(trace.Component, "cgroup-cleaner")

conn, err := systemdDbus.New()
if err != nil {
return trace.Wrap(err)
}
defer conn.Close()

var paths []string

baseTime := time.Now().Add(-time.Minute)
Contributor:
Do you think a 1 minute interval is enough? Maybe, to be on the safe side, make it something like an hour, or at least 10 minutes?

Contributor Author:
I was thinking the likelihood of a race here is something like a few ns, maybe a few ms if the system is busy. This should be systemd creating a scope, which starts empty, and then placing the launched process into the particular cgroup. So the window is extremely tiny, and a minute is already overkill. If the logic here is incorrect and this stops scopes it shouldn't, I don't see much difference between stopping a scope that is 1 minute old and one that is 1 hour old in terms of causing problems, except that at 1 minute the mistake is hopefully a lot more apparent.

I suppose the potential for a false positive is a process inside planet creating a scope and then not using it for a few minutes, at which point this cleaner will have come in and removed it.

It would be possible to get the scope object from systemd first and match specifically against Kubernetes mounts, but I was trying to keep the implementation relatively simple.
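To illustrate that stricter matching, the unit's properties could be checked before it is stopped. A sketch only: it reuses the systemdDbus connection and trace package this file already imports, and the Description substring it matches on is an assumption about how the kubelet names these scopes, not something taken from this PR:

// isKubernetesMountScope reports whether a scope's systemd Description looks
// like a kubelet transient mount. The matched substring is an assumption.
func isKubernetesMountScope(conn *systemdDbus.Conn, unitName string) (bool, error) {
	props, err := conn.GetUnitProperties(unitName)
	if err != nil {
		return false, trace.Wrap(err)
	}
	desc, ok := props["Description"].(string)
	if !ok {
		return false, nil
	}
	return strings.Contains(desc, "Kubernetes transient mount"), nil
}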


root := "/sys/fs/cgroup/systemd/"
err = filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
if err != nil {
return trace.ConvertSystemError(err)
}

// A run scope will have a directory name that looks something like run-r2343e8b13fd44b1297e241421fc1f6e3.scope.
// We also want to protect against potential races where the cgroup has been created but doesn't have any pids
// added yet, so only consider paths that have existed for at least a minute to be safe.
if strings.HasPrefix(info.Name(), "run-") &&
strings.HasSuffix(info.Name(), ".scope") &&
baseTime.After(info.ModTime()) {
paths = append(paths, path)
}
return nil
})
if err != nil {
log.WithError(err).Warn("Error occurred while scanning cgroup hierarchy for unused systemd scopes.")
}

for _, path := range paths {
unitName := filepath.Base(path)
log := log.WithFields(logrus.Fields{
"path": path,
"unit": unitName,
})

// the cgroup virtual filesystem does not report file sizes, so we need to read the cgroup.procs file
// to see if it has any contents (any processes listed)
// http://man7.org/linux/man-pages/man7/cgroups.7.html
//	The cgroup.procs file can be read to obtain a list of the processes
//	that are members of a cgroup. The returned list of PIDs is not
//	guaranteed to be in order. Nor is it guaranteed to be free of
//	duplicates. (For example, a PID may be recycled while reading from
//	the list.)
procsPath := filepath.Join(path, "cgroup.procs")
pids, err := ioutil.ReadFile(procsPath)
if err != nil {
if !trace.IsNotFound(trace.ConvertSystemError(err)) {
log.WithError(err).Warn("Failed to read process list belonging to cgroup.")
}
continue
}

if len(pids) != 0 {
continue
}

_, err = conn.StopUnit(unitName, "replace", nil)
if err != nil {
log.WithError(err).Warn("Failed to stop systemd unit.")
continue
}

log.Info("Stopped systemd scope unit with no pids.")
}

return nil
}
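A design note on the StopUnit call above: passing a nil channel makes the stop fire-and-forget. go-systemd also accepts a result channel if the caller wants to block until the job completes; a minimal sketch of that variant, using the same imports as this file:

// stopUnitAndWait stops a unit and blocks until systemd reports the job
// result ("done", "failed", "timeout", ...) on the supplied channel.
func stopUnitAndWait(conn *systemdDbus.Conn, unitName string) error {
	resultc := make(chan string, 1)
	if _, err := conn.StopUnit(unitName, "replace", resultc); err != nil {
		return trace.Wrap(err)
	}
	if result := <-resultc; result != "done" {
		return trace.Errorf("stopping %v completed with result %q", unitName, result)
	}
	return nil
}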

/*
Extra Notes for Cgroup Cleanup.
The issue can be reproduced on CentOS 7.7.1908 with the following Kubernetes config:

apiVersion: v1
kind: Secret
metadata:
  name: mysecret
type: Opaque
data:
  username: YWRtaW4=
  password: MWYyZDFlMmU2N2Rm
---
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: hello
spec:
  schedule: "* * * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: hello
            image: busybox
            args:
            - /bin/sh
            - -c
            - date; echo Hello from the Kubernetes cluster; sleep 60
            volumeMounts:
            - name: foo
              mountPath: "/etc/foo"
              readOnly: true
          restartPolicy: OnFailure
          volumes:
          - name: foo
            secret:
              secretName: mysecret
*/
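To confirm the repro (or that the cleaner is keeping up), the empty run-*.scope cgroups can be counted directly. A standalone sketch, assuming the same legacy cgroup v1 hierarchy the cleaner walks:

// countleaks.go: counts run-*.scope cgroups with no member processes under
// the systemd cgroup v1 hierarchy. Sketch for verifying the repro above.
package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
)

func main() {
	var empty, total int
	root := "/sys/fs/cgroup/systemd/"
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if !info.IsDir() || !strings.HasPrefix(info.Name(), "run-") || !strings.HasSuffix(info.Name(), ".scope") {
			return nil
		}
		total++
		// An empty cgroup.procs file means no processes remain in the scope.
		pids, err := ioutil.ReadFile(filepath.Join(path, "cgroup.procs"))
		if err == nil && len(pids) == 0 {
			empty++
		}
		return nil
	})
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Printf("%v of %v run-*.scope cgroups are empty\n", empty, total)
}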