
Implement workaround to clean up leaking cgroups (#570)
* Implement workaround to clean up leaking cgroups

This change implements a cleaner that scans for cgroups created by
systemd-run --scope that do not have any pids assigned, indicating
that the cgroup is unused and should be cleaned up. On some systems,
due to either systemd or the kernel, the scope is not cleaned up
when the pids within it have finished executing, leading to an
eventual memory leak.

Kubernetes uses systemd-run --scope when creating mount points
that may require drivers to be loaded or run in a separate context
from the kubelet, which allows the above leak to occur.
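
To illustrate the pattern (this is a hypothetical sketch, not the kubelet code; the helper name, description string, and mount arguments are invented), a mount command wrapped in a transient scope via systemd-run --scope looks roughly like this. Each such invocation creates a run-<id>.scope unit and cgroup, which systemd should remove once the command exits:

package main

import (
	"fmt"
	"os/exec"
)

// mountInScope is a hypothetical helper that performs a mount inside a
// transient systemd scope, so the mount helper runs in a separate context
// from the calling process.
func mountInScope(source, target, fstype string) error {
	args := []string{"--description=example mount scope", "--scope", "--", "mount"}
	if fstype != "" {
		args = append(args, "-t", fstype)
	}
	args = append(args, source, target)

	// systemd-run creates a transient run-<id>.scope unit for this command.
	out, err := exec.Command("systemd-run", args...).CombinedOutput()
	if err != nil {
		return fmt.Errorf("systemd-run mount failed: %v (output: %q)", err, out)
	}
	return nil
}

func main() {
	// Placeholder usage; an actual mount requires root privileges.
	if err := mountInScope("tmpfs", "/mnt/example", "tmpfs"); err != nil {
		fmt.Println(err)
	}
}

On affected systems the scope is not removed after the command exits, which is the leak the cleaner in this change works around.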

kubernetes/kubernetes#70324
kubernetes/kubernetes#64137
gravitational/gravity#1219

* change logging level for cgroup cleanup

* address review feedback

* address review feedback

(cherry picked from commit 00ed8e6)
Kevin Nisbet committed Mar 17, 2020
1 parent d2de692 commit d85094c
Showing 2 changed files with 150 additions and 0 deletions.
tool/planet/agent.go: 2 additions & 0 deletions
@@ -314,6 +314,8 @@ func runAgent(conf *agent.Config, monitoringConf *monitoring.Config, leaderConf
		}
	}

	go runSystemdCgroupCleaner(ctx)

	signalc := make(chan os.Signal, 2)
	signal.Notify(signalc, os.Interrupt, syscall.SIGTERM)

tool/planet/cgroup.go: 148 additions & 0 deletions
@@ -37,12 +37,16 @@ package main

import (
	"bytes"
	"context"
	"fmt"
	"io/ioutil"
	"os"
	"path"
	"path/filepath"
	"runtime"
	"strings"
	"text/template"
	"time"

	systemdDbus "github.com/coreos/go-systemd/dbus"
	"github.com/davecgh/go-spew/spew"
@@ -320,3 +324,147 @@ func u64(n uint64) *uint64 {
func i64(n int64) *int64 {
	return &n
}

// runSystemdCgroupCleaner implements a workaround for a kubernetes/systemd issue with cgroups/systemd scopes that leak
// under certain circumstances, usually when using a kubernetes cronjob with a mount. This appears to be mostly a
// systemd or kernel issue, where the pids running within the scope do not cause the scope to complete and clean up,
// resulting in leaked memory.
// https://github.com/kubernetes/kubernetes/issues/70324
// https://github.com/kubernetes/kubernetes/issues/64137
// https://github.com/gravitational/gravity/issues/1219
//
// Kubernetes uses systemd-run --scope when creating mounts within systemd.
// https://github.com/gravitational/kubernetes/blob/1c045a09db662c6660562d88deff2733ca17dcf8/pkg/util/mount/mount_linux.go#L108-L131
//
// To clean up this leak, we scan for run-xxx.scope cgroups that are at least a minute old and do not contain any
// processes, and then have systemd remove those scopes.
func runSystemdCgroupCleaner(ctx context.Context) {
	ticker := time.NewTicker(15 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			err := cleanSystemdScopes()
			if err != nil {
				logrus.WithError(err).Warn("Failed to clean systemd scopes that don't contain processes")
			}
		case <-ctx.Done():
			return
		}
	}
}

func cleanSystemdScopes() error {
	log := logrus.WithField(trace.Component, "cgroup-cleaner")

	conn, err := systemdDbus.New()
	if err != nil {
		return trace.Wrap(err)
	}
	defer conn.Close()

	var paths []string

	baseTime := time.Now().Add(-time.Minute)

	root := "/sys/fs/cgroup/systemd/"
	err = filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return trace.ConvertSystemError(err)
		}

		// A run scope will have a directory name that looks something like run-r2343e8b13fd44b1297e241421fc1f6e3.scope.
		// We also want to protect against potential races, where the cgroup is created but doesn't have any pids
		// added yet, so only consider paths that have existed for at least a minute to be safe.
		if strings.HasPrefix(info.Name(), "run-") &&
			strings.HasSuffix(info.Name(), ".scope") &&
			baseTime.After(info.ModTime()) {
			paths = append(paths, path)
		}
		return nil
	})
	if err != nil {
		log.WithError(err).Warn("Error occurred while scanning cgroup hierarchy for unused systemd scopes.")
	}

	for _, path := range paths {
		unitName := filepath.Base(path)
		log := log.WithFields(logrus.Fields{
			"path": path,
			"unit": unitName,
		})

		// The cgroup virtual filesystem does not report file sizes, so we need to read the cgroup.procs file
		// to see if there are any contents (any processes listed).
		// http://man7.org/linux/man-pages/man7/cgroups.7.html
		//   The cgroup.procs file can be read to obtain a list of the processes
		//   that are members of a cgroup. The returned list of PIDs is not guar‐
		//   anteed to be in order. Nor is it guaranteed to be free of dupli‐
		//   cates. (For example, a PID may be recycled while reading from the
		//   list.)
		procsPath := filepath.Join(path, "cgroup.procs")
		pids, err := ioutil.ReadFile(procsPath)
		if err != nil {
			if !trace.IsNotFound(trace.ConvertSystemError(err)) {
				log.WithError(err).Warn("Failed to read process list belonging to cgroup.")
			}
			continue
		}

		if len(pids) != 0 {
			continue
		}

		_, err = conn.StopUnit(unitName, "replace", nil)
		if err != nil {
			log.WithError(err).Warn("Failed to stop systemd unit.")
			continue
		}

		log.Info("Stopped systemd scope unit with no pids.")
	}

	return nil
}

/*
Extra Notes for Cgroup Cleanup.
The issue can be reproduced on CentOS 7.7.1908 with the following Kubernetes config:

apiVersion: v1
kind: Secret
metadata:
  name: mysecret
type: Opaque
data:
  username: YWRtaW4=
  password: MWYyZDFlMmU2N2Rm
---
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: hello
spec:
  schedule: "* * * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: hello
            image: busybox
            args:
            - /bin/sh
            - -c
            - date; echo Hello from the Kubernetes cluster; sleep 60
            volumeMounts:
            - name: foo
              mountPath: "/etc/foo"
              readOnly: true
          restartPolicy: OnFailure
          volumes:
          - name: foo
            secret:
              secretName: mysecret
*/
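
As a way to observe the leak while reproducing it, here is a minimal sketch (an assumption-laden example, not part of this change: it assumes the cgroup v1 hierarchy is mounted at /sys/fs/cgroup/systemd, the same path the cleaner uses) that counts run-*.scope cgroups whose cgroup.procs file is empty. Run it periodically while the CronJob above is firing; on affected systems the count grows until the cleaner stops the empty scopes:

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
)

func main() {
	// Count scope cgroups that no longer contain any processes.
	empty := 0
	root := "/sys/fs/cgroup/systemd"
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil || !info.IsDir() {
			// Skip unreadable entries and regular files.
			return nil
		}
		if strings.HasPrefix(info.Name(), "run-") && strings.HasSuffix(info.Name(), ".scope") {
			pids, readErr := ioutil.ReadFile(filepath.Join(path, "cgroup.procs"))
			if readErr == nil && len(pids) == 0 {
				empty++
			}
		}
		return nil
	})
	if err != nil {
		fmt.Println("walk error:", err)
	}
	fmt.Printf("empty run-*.scope cgroups: %d\n", empty)
}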
