Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cgroup: move processes from root group to new /init group #3242

Closed
wants to merge 11 commits into from
4 changes: 2 additions & 2 deletions pkg/agent/containerd/config_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ func setupContainerdConfig(ctx context.Context, cfg *config.Node) error {
}

isRunningInUserNS := userns.RunningInUserNS()
_, _, hasCFS, hasPIDs := cgroups.CheckCgroups()
cgroupsCheck := cgroups.CheckCgroups()
// "/sys/fs/cgroup" is namespaced
cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil
disableCgroup := isRunningInUserNS && (!hasCFS || !hasPIDs || !cgroupfsWritable)
disableCgroup := isRunningInUserNS && (!cgroupsCheck.HasCFS || !cgroupsCheck.HasPIDs || !cgroupfsWritable)
if disableCgroup {
logrus.Warn("cgroup v2 controllers are not delegated for rootless. Disabling cgroup.")
}
Expand Down
62 changes: 41 additions & 21 deletions pkg/cgroups/cgroups_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,20 @@ import (
"strings"

"github.com/containerd/cgroups"
cgroupsv2 "github.com/containerd/cgroups/v2"
cgroupsv2 "github.com/containerd/cgroups/v2" // used for cgroup2 evacuation, not specific to rootless mode
"github.com/rancher/k3s/pkg/version"
"github.com/sirupsen/logrus"
)

// CgroupCheck holds the results of probing the host's cgroup configuration.
// It is produced by CheckCgroups and consumed when building kubelet arguments
// and deciding whether cgroup2 root-process evacuation is required.
type CgroupCheck struct {
	// KubeletRoot is the cgroup path passed to the kubelet via
	// --kubelet-cgroups when non-empty.
	KubeletRoot string
	// RuntimeRoot is the cgroup path passed to the kubelet via
	// --runtime-cgroups when non-empty.
	RuntimeRoot string
	// HasCFS reports whether the CPU CFS quota controller is available
	// (cpu controller on v2, cpu.cfs_period_us present on v1); when false,
	// CPU quotas are disabled.
	HasCFS bool
	// HasPIDs reports whether the pids controller is available; the kubelet
	// refuses to start without it.
	HasPIDs bool
	// IsV2 reports whether the host is running the unified (cgroup v2)
	// hierarchy.
	IsV2 bool
	V2Evac bool // cgroupv2 needs evacuation of procs from /
}

func Validate() error {
if cgroups.Mode() == cgroups.Unified {
return validateCgroupsV2()
Expand Down Expand Up @@ -64,34 +73,45 @@ func validateCgroupsV2() error {
return nil
}

func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
cgroupsModeV2 := cgroups.Mode() == cgroups.Unified
func CheckCgroups() (check CgroupCheck) {
check.IsV2 = cgroups.Mode() == cgroups.Unified

// For Unified (v2) cgroups we can directly check to see what controllers are mounted
// under the unified hierarchy.
if cgroupsModeV2 {
m, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/")
if check.IsV2 {
cgroupRoot, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/")
if err != nil {
logrus.Errorf("Failed to load root cgroup: %+v", err)
return check
}

cgroupRootProcs, err := cgroupRoot.Procs(false)
if err != nil {
return "", "", false, false
return check
}
controllers, err := m.Controllers()

// if the root cgroup is not empty, we need to evacuate it
check.V2Evac = len(cgroupRootProcs) > 0

controllers, err := cgroupRoot.Controllers()
if err != nil {
return "", "", false, false
return check
}

// Intentionally using an expressionless switch to match the logic below
for _, controller := range controllers {
switch {
case controller == "cpu":
hasCFS = true
check.HasCFS = true
case controller == "pids":
hasPIDs = true
check.HasPIDs = true
}
}
}

f, err := os.Open("/proc/self/cgroup")
if err != nil {
return "", "", false, false
return check
}
defer f.Close()

Expand All @@ -106,7 +126,7 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
// For v2, controllers = {""} (only contains a single empty string)
for _, controller := range controllers {
switch {
case controller == "name=systemd" || cgroupsModeV2:
case controller == "name=systemd" || check.IsV2:
// If we detect that we are running under a `.scope` unit with systemd
// we can assume we are being directly invoked from the command line
// and thus need to set our kubelet root to something out of the context
Expand All @@ -118,7 +138,7 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
last := parts[len(parts)-1]
i := strings.LastIndex(last, ".scope")
if i > 0 {
kubeletRoot = "/" + version.Program
check.KubeletRoot = "/" + version.Program
}
case controller == "cpu":
// It is common for this to show up multiple times in /sys/fs/cgroup if the controllers are comounted:
Expand All @@ -127,25 +147,25 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
// can fail if we use the comma-separated name. Instead, we check for the controller using the symlink.
p := filepath.Join("/sys/fs/cgroup", controller, parts[2], "cpu.cfs_period_us")
if _, err := os.Stat(p); err == nil {
hasCFS = true
check.HasCFS = true
}
case controller == "pids":
hasPIDs = true
check.HasPIDs = true
}
}
}

// If we're running with v1 and didn't find a scope assigned by systemd, we need to create our own root cgroup to avoid
// just inheriting from the parent process. The kubelet will take care of moving us into it when we start it up later.
if kubeletRoot == "" {
if check.KubeletRoot == "" {
// Examine process ID 1 to see if there is a cgroup assigned to it.
// When we are not in a container, process 1 is likely to be systemd or some other service manager.
// It either lives at `/` or `/init.scope` according to https://man7.org/linux/man-pages/man7/systemd.special.7.html
// When containerized, process 1 will generally be in a cgroup; otherwise, we may be running in
// a host PID scenario but we don't support this.
g, err := os.Open("/proc/1/cgroup")
if err != nil {
return "", "", false, false
return check
}
defer g.Close()
scan = bufio.NewScanner(g)
Expand All @@ -159,15 +179,15 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
// For v2, controllers = {""} (only contains a single empty string)
for _, controller := range controllers {
switch {
case controller == "name=systemd" || cgroupsModeV2:
case controller == "name=systemd" || check.IsV2:
last := parts[len(parts)-1]
if last != "/" && last != "/init.scope" {
kubeletRoot = "/" + version.Program
runtimeRoot = "/" + version.Program
check.KubeletRoot = "/" + version.Program
check.RuntimeRoot = "/" + version.Program
}
}
}
}
}
return kubeletRoot, runtimeRoot, hasCFS, hasPIDs
return check
}
13 changes: 12 additions & 1 deletion pkg/daemons/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ import (

"github.com/rancher/k3s/pkg/agent/config"
"github.com/rancher/k3s/pkg/agent/proxy"
"github.com/rancher/k3s/pkg/cgroups"
daemonconfig "github.com/rancher/k3s/pkg/daemons/config"
"github.com/rancher/k3s/pkg/daemons/executor"
"github.com/rootless-containers/rootlesskit/pkg/parent/cgrouputil" // used for cgroup2 evacuation, not specific to rootless mode
"github.com/sirupsen/logrus"
"k8s.io/component-base/logs"
_ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration
Expand Down Expand Up @@ -49,7 +51,16 @@ func startKubeProxy(ctx context.Context, cfg *daemonconfig.Agent) error {
}

func startKubelet(ctx context.Context, cfg *daemonconfig.Agent) error {
argsMap := kubeletArgs(cfg)
cgroupsCheck := cgroups.CheckCgroups()
if cgroupsCheck.V2Evac {
// evacuate processes from cgroup / to /init
if err := cgrouputil.EvacuateCgroup2("init"); err != nil {
logrus.Errorf("failed to evacuate cgroup2: %+v", err)
return err
}
}

argsMap := kubeletArgs(cfg, cgroupsCheck)

args := daemonconfig.GetArgs(argsMap, cfg.ExtraKubeletArgs)
logrus.Infof("Running kubelet %s", daemonconfig.ArgString(args))
Expand Down
17 changes: 8 additions & 9 deletions pkg/daemons/agent/agent_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func kubeProxyArgs(cfg *config.Agent) map[string]string {
return argsMap
}

func kubeletArgs(cfg *config.Agent) map[string]string {
func kubeletArgs(cfg *config.Agent, cgroupsInfo cgroups.CgroupCheck) map[string]string {
argsMap := map[string]string{
"healthz-bind-address": "127.0.0.1",
"read-only-port": "0",
Expand Down Expand Up @@ -133,19 +133,18 @@ func kubeletArgs(cfg *config.Agent) map[string]string {
if err != nil || defaultIP.String() != cfg.NodeIP {
argsMap["node-ip"] = cfg.NodeIP
}
kubeletRoot, runtimeRoot, hasCFS, hasPIDs := cgroups.CheckCgroups()
if !hasCFS {
if !cgroupsInfo.HasCFS {
logrus.Warn("Disabling CPU quotas due to missing cpu.cfs_period_us")
argsMap["cpu-cfs-quota"] = "false"
}
if !hasPIDs {
if !cgroupsInfo.HasPIDs {
logrus.Fatal("PIDS cgroup support not found")
}
if kubeletRoot != "" {
argsMap["kubelet-cgroups"] = kubeletRoot
if cgroupsInfo.KubeletRoot != "" {
argsMap["kubelet-cgroups"] = cgroupsInfo.KubeletRoot
}
if runtimeRoot != "" {
argsMap["runtime-cgroups"] = runtimeRoot
if cgroupsInfo.RuntimeRoot != "" {
argsMap["runtime-cgroups"] = cgroupsInfo.RuntimeRoot
}
if userns.RunningInUserNS() {
argsMap["feature-gates"] = util.AddFeatureGate(argsMap["feature-gates"], "DevicePlugins=false")
Expand All @@ -167,7 +166,7 @@ func kubeletArgs(cfg *config.Agent) map[string]string {
}

if cfg.Rootless {
createRootlessConfig(argsMap, hasCFS, hasCFS)
createRootlessConfig(argsMap, cgroupsInfo.HasCFS, cgroupsInfo.HasCFS)
}

if cfg.ProtectKernelDefaults {
Expand Down
26 changes: 3 additions & 23 deletions pkg/rootless/rootless.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"

"github.com/opencontainers/runc/libcontainer/cgroups"
Expand All @@ -19,14 +18,12 @@ import (
"github.com/rootless-containers/rootlesskit/pkg/parent"
portbuiltin "github.com/rootless-containers/rootlesskit/pkg/port/builtin"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)

var (
pipeFD = "_K3S_ROOTLESS_FD"
childEnv = "_K3S_ROOTLESS_SOCK"
evacuateCgroup2Env = "_K3S_ROOTLESS_EVACUATE_CGROUP2" // boolean
Sock = ""
pipeFD = "_K3S_ROOTLESS_FD"
childEnv = "_K3S_ROOTLESS_SOCK"
Sock = ""
)

func Rootless(stateDir string) error {
Expand Down Expand Up @@ -65,9 +62,6 @@ func Rootless(stateDir string) error {
}

os.Setenv(childEnv, filepath.Join(parentOpt.StateDir, parent.StateFileAPISock))
if parentOpt.EvacuateCgroup2 != "" {
os.Setenv(evacuateCgroup2Env, "1")
}
if err := parent.Parent(*parentOpt); err != nil {
logrus.Fatal(err)
}
Expand Down Expand Up @@ -144,13 +138,6 @@ func createParentOpt(stateDir string) (*parent.Opt, error) {
}
if selfCgroup2 := selfCgroupMap[""]; selfCgroup2 == "" {
logrus.Warnf("enabling cgroup2 is highly recommended, see https://rootlesscontaine.rs/getting-started/common/cgroup2/")
} else {
selfCgroup2Dir := filepath.Join("/sys/fs/cgroup", selfCgroup2)
if unix.Access(selfCgroup2Dir, unix.W_OK) == nil {
opt.EvacuateCgroup2 = "k3s_evac"
} else {
logrus.Warn("cannot set cgroup2 evacuation, make sure to run k3s as a systemd unit")
}
}

mtu := 0
Expand Down Expand Up @@ -198,12 +185,5 @@ func createChildOpt() (*child.Opt, error) {
opt.CopyUpDriver = tmpfssymlink.NewChildDriver()
opt.MountProcfs = true
opt.Reaper = true
if v := os.Getenv(evacuateCgroup2Env); v != "" {
var err error
opt.EvacuateCgroup2, err = strconv.ParseBool(v)
if err != nil {
return nil, err
}
}
return opt, nil
}