From 00fe31338aebbba1b2669e5372d334a4bf47b4aa Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Wed, 28 Apr 2021 10:12:35 +0200 Subject: [PATCH 01/10] cgroup: move processes from root group to new /init group - Ref: see discussion at https://github.com/k3s-io/k3s/pull/3237 Signed-off-by: Thorsten Klein --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 1d249ea28fd1..cba085b428fc 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,5 @@ __pycache__ /tests/.vscode /sonobuoy-output *.tmp +/.vscode +/.local From 91372915107137fbf72b437a47582be45c492ae8 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Thu, 6 May 2021 20:00:32 +0200 Subject: [PATCH 02/10] use cgrouputil.EvacuateCgroup2 instead of custom code Signed-off-by: Thorsten Klein --- pkg/daemons/agent/agent.go | 6 ++++++ pkg/rootless/rootless.go | 8 -------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go index a03f4d1520bc..93718c8858c9 100644 --- a/pkg/daemons/agent/agent.go +++ b/pkg/daemons/agent/agent.go @@ -7,6 +7,12 @@ import ( "github.com/rancher/k3s/pkg/daemons/config" "github.com/rancher/k3s/pkg/daemons/executor" +<<<<<<< HEAD +======= + "github.com/rancher/k3s/pkg/util" + "github.com/rancher/k3s/pkg/version" + "github.com/rootless-containers/rootlesskit/pkg/parent/cgrouputil" +>>>>>>> cee14fa1dd (use cgrouputil.ecavuatecgroup2 instead of custom code) "github.com/sirupsen/logrus" "k8s.io/component-base/logs" _ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration diff --git a/pkg/rootless/rootless.go b/pkg/rootless/rootless.go index d81ed00b3b2b..0ad2dcab4559 100644 --- a/pkg/rootless/rootless.go +++ b/pkg/rootless/rootless.go @@ -19,7 +19,6 @@ import ( "github.com/rootless-containers/rootlesskit/pkg/parent" portbuiltin "github.com/rootless-containers/rootlesskit/pkg/port/builtin" "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) var ( @@ 
-144,13 +143,6 @@ func createParentOpt(stateDir string) (*parent.Opt, error) { } if selfCgroup2 := selfCgroupMap[""]; selfCgroup2 == "" { logrus.Warnf("enabling cgroup2 is highly recommended, see https://rootlesscontaine.rs/getting-started/common/cgroup2/") - } else { - selfCgroup2Dir := filepath.Join("/sys/fs/cgroup", selfCgroup2) - if unix.Access(selfCgroup2Dir, unix.W_OK) == nil { - opt.EvacuateCgroup2 = "k3s_evac" - } else { - logrus.Warn("cannot set cgroup2 evacuation, make sure to run k3s as a systemd unit") - } } mtu := 0 From b6b495f1a6afd3249b77ea4e9397afd27c6107ff Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Fri, 7 May 2021 08:55:03 +0200 Subject: [PATCH 03/10] Drop evacuation opt in subprocesses and move evacuation out of checkcgroups Signed-off-by: Thorsten Klein --- pkg/daemons/agent/agent.go | 8 +------- pkg/rootless/rootless.go | 11 ----------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go index 93718c8858c9..fd6ed1173ae6 100644 --- a/pkg/daemons/agent/agent.go +++ b/pkg/daemons/agent/agent.go @@ -6,13 +6,7 @@ import ( "time" "github.com/rancher/k3s/pkg/daemons/config" - "github.com/rancher/k3s/pkg/daemons/executor" -<<<<<<< HEAD -======= - "github.com/rancher/k3s/pkg/util" - "github.com/rancher/k3s/pkg/version" - "github.com/rootless-containers/rootlesskit/pkg/parent/cgrouputil" ->>>>>>> cee14fa1dd (use cgrouputil.ecavuatecgroup2 instead of custom code) + "github.com/rancher/k3s/pkg/daemons/executor" // used for cgroup2 evacuation, not specific to rootless mode "github.com/sirupsen/logrus" "k8s.io/component-base/logs" _ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration diff --git a/pkg/rootless/rootless.go b/pkg/rootless/rootless.go index 0ad2dcab4559..c5bb765c646d 100644 --- a/pkg/rootless/rootless.go +++ b/pkg/rootless/rootless.go @@ -8,7 +8,6 @@ import ( "os" "os/exec" "path/filepath" - "strconv" "strings" 
"github.com/opencontainers/runc/libcontainer/cgroups" @@ -64,9 +63,6 @@ func Rootless(stateDir string) error { } os.Setenv(childEnv, filepath.Join(parentOpt.StateDir, parent.StateFileAPISock)) - if parentOpt.EvacuateCgroup2 != "" { - os.Setenv(evacuateCgroup2Env, "1") - } if err := parent.Parent(*parentOpt); err != nil { logrus.Fatal(err) } @@ -190,12 +186,5 @@ func createChildOpt() (*child.Opt, error) { opt.CopyUpDriver = tmpfssymlink.NewChildDriver() opt.MountProcfs = true opt.Reaper = true - if v := os.Getenv(evacuateCgroup2Env); v != "" { - var err error - opt.EvacuateCgroup2, err = strconv.ParseBool(v) - if err != nil { - return nil, err - } - } return opt, nil } From 07c2cc6cc5e06f80995ad6de23d1286161aeed0f Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Fri, 7 May 2021 09:12:06 +0200 Subject: [PATCH 04/10] drop unused k3s_rootless_evacuate_cgroup2 env var definition Signed-off-by: Thorsten Klein --- pkg/rootless/rootless.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/rootless/rootless.go b/pkg/rootless/rootless.go index c5bb765c646d..633c22838317 100644 --- a/pkg/rootless/rootless.go +++ b/pkg/rootless/rootless.go @@ -21,10 +21,9 @@ import ( ) var ( - pipeFD = "_K3S_ROOTLESS_FD" - childEnv = "_K3S_ROOTLESS_SOCK" - evacuateCgroup2Env = "_K3S_ROOTLESS_EVACUATE_CGROUP2" // boolean - Sock = "" + pipeFD = "_K3S_ROOTLESS_FD" + childEnv = "_K3S_ROOTLESS_SOCK" + Sock = "" ) func Rootless(stateDir string) error { From 85838c3da6cab8fd1388e365920048ebc9073009 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Fri, 7 May 2021 09:32:48 +0200 Subject: [PATCH 05/10] move unnecessary gitignore items to local gitignore Signed-off-by: Thorsten Klein --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index cba085b428fc..1d249ea28fd1 100644 --- a/.gitignore +++ b/.gitignore @@ -29,5 +29,3 @@ __pycache__ /tests/.vscode /sonobuoy-output *.tmp -/.vscode -/.local From 
20afcbf61d4e0138cad2d8ef3e1c250484675bec Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Fri, 7 May 2021 10:11:32 +0200 Subject: [PATCH 06/10] use a struct for cgroupscheck results Signed-off-by: Thorsten Klein --- pkg/daemons/agent/agent.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go index fd6ed1173ae6..611e61a45b1d 100644 --- a/pkg/daemons/agent/agent.go +++ b/pkg/daemons/agent/agent.go @@ -18,6 +18,15 @@ const ( windowsPrefix = "npipe://" ) +type CgroupCheck struct { + KubeletRoot string + RuntimeRoot string + HasCFS bool + HasPIDs bool + IsV2 bool + V2Evac bool // cgroupv2 needs evacuation of procs from / +} + func Agent(config *config.Agent) error { rand.Seed(time.Now().UTC().UnixNano()) From 57a3bbe98ac71dd85bb1195e260d0422d36502d7 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Tue, 20 Jul 2021 09:02:32 +0200 Subject: [PATCH 07/10] cgroupsv2: revamp PR based on changes in master Signed-off-by: Thorsten Klein --- pkg/agent/containerd/config_linux.go | 4 +- pkg/cgroups/cgroups_linux.go | 62 ++++++++++++++++++---------- pkg/daemons/agent/agent.go | 24 ++++++----- pkg/daemons/agent/agent_linux.go | 17 ++++---- 4 files changed, 64 insertions(+), 43 deletions(-) diff --git a/pkg/agent/containerd/config_linux.go b/pkg/agent/containerd/config_linux.go index 936e4fdcdaba..25e257dd1ad4 100644 --- a/pkg/agent/containerd/config_linux.go +++ b/pkg/agent/containerd/config_linux.go @@ -43,10 +43,10 @@ func setupContainerdConfig(ctx context.Context, cfg *config.Node) error { } isRunningInUserNS := system.RunningInUserNS() - _, _, hasCFS, hasPIDs := cgroups.CheckCgroups() + cgroupsCheck := cgroups.CheckCgroups() // "/sys/fs/cgroup" is namespaced cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil - disableCgroup := isRunningInUserNS && (!hasCFS || !hasPIDs || !cgroupfsWritable) + disableCgroup := isRunningInUserNS && (!cgroupsCheck.HasCFS || !cgroupsCheck.HasPIDs || 
!cgroupfsWritable) if disableCgroup { logrus.Warn("cgroup v2 controllers are not delegated for rootless. Disabling cgroup.") } diff --git a/pkg/cgroups/cgroups_linux.go b/pkg/cgroups/cgroups_linux.go index 3f97ba915c80..467db48fd63a 100644 --- a/pkg/cgroups/cgroups_linux.go +++ b/pkg/cgroups/cgroups_linux.go @@ -12,11 +12,20 @@ import ( "strings" "github.com/containerd/cgroups" - cgroupsv2 "github.com/containerd/cgroups/v2" + cgroupsv2 "github.com/containerd/cgroups/v2" // used for cgroup2 evacuation, not specific to rootless mode "github.com/rancher/k3s/pkg/version" "github.com/sirupsen/logrus" ) +type CgroupCheck struct { + KubeletRoot string + RuntimeRoot string + HasCFS bool + HasPIDs bool + IsV2 bool + V2Evac bool // cgroupv2 needs evacuation of procs from / +} + func Validate() error { if cgroups.Mode() == cgroups.Unified { return validateCgroupsV2() @@ -64,34 +73,45 @@ func validateCgroupsV2() error { return nil } -func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { - cgroupsModeV2 := cgroups.Mode() == cgroups.Unified +func CheckCgroups() (check CgroupCheck) { + check.IsV2 = cgroups.Mode() == cgroups.Unified // For Unified (v2) cgroups we can directly check to see what controllers are mounted // under the unified hierarchy. 
- if cgroupsModeV2 { - m, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/") + if check.IsV2 { + cgroupRoot, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/") + if err != nil { + logrus.Errorf("Failed to load root cgroup: %+v", err) + return check + } + + cgroupRootProcs, err := cgroupRoot.Procs(false) if err != nil { - return "", "", false, false + return check } - controllers, err := m.Controllers() + + // if the root cgroup is not empty, we need to evacuate it + check.V2Evac = len(cgroupRootProcs) > 0 + + controllers, err := cgroupRoot.Controllers() if err != nil { - return "", "", false, false + return check } + // Intentionally using an expressionless switch to match the logic below for _, controller := range controllers { switch { case controller == "cpu": - hasCFS = true + check.HasCFS = true case controller == "pids": - hasPIDs = true + check.HasPIDs = true } } } f, err := os.Open("/proc/self/cgroup") if err != nil { - return "", "", false, false + return check } defer f.Close() @@ -106,7 +126,7 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { // For v2, controllers = {""} (only contains a single empty string) for _, controller := range controllers { switch { - case controller == "name=systemd" || cgroupsModeV2: + case controller == "name=systemd" || check.IsV2: // If we detect that we are running under a `.scope` unit with systemd // we can assume we are being directly invoked from the command line // and thus need to set our kubelet root to something out of the context @@ -118,7 +138,7 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { last := parts[len(parts)-1] i := strings.LastIndex(last, ".scope") if i > 0 { - kubeletRoot = "/" + version.Program + check.KubeletRoot = "/" + version.Program } case controller == "cpu": // It is common for this to show up multiple times in /sys/fs/cgroup if the controllers are comounted: @@ -127,17 +147,17 @@ func CheckCgroups() (kubeletRoot, runtimeRoot 
string, hasCFS, hasPIDs bool) { // can fail if we use the comma-separated name. Instead, we check for the controller using the symlink. p := filepath.Join("/sys/fs/cgroup", controller, parts[2], "cpu.cfs_period_us") if _, err := os.Stat(p); err == nil { - hasCFS = true + check.HasCFS = true } case controller == "pids": - hasPIDs = true + check.HasPIDs = true } } } // If we're running with v1 and didn't find a scope assigned by systemd, we need to create our own root cgroup to avoid // just inheriting from the parent process. The kubelet will take care of moving us into it when we start it up later. - if kubeletRoot == "" { + if check.KubeletRoot == "" { // Examine process ID 1 to see if there is a cgroup assigned to it. // When we are not in a container, process 1 is likely to be systemd or some other service manager. // It either lives at `/` or `/init.scope` according to https://man7.org/linux/man-pages/man7/systemd.special.7.html @@ -145,7 +165,7 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { // a host PID scenario but we don't support this. 
g, err := os.Open("/proc/1/cgroup") if err != nil { - return "", "", false, false + return check } defer g.Close() scan = bufio.NewScanner(g) @@ -159,15 +179,15 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { // For v2, controllers = {""} (only contains a single empty string) for _, controller := range controllers { switch { - case controller == "name=systemd" || cgroupsModeV2: + case controller == "name=systemd" || check.IsV2: last := parts[len(parts)-1] if last != "/" && last != "/init.scope" { - kubeletRoot = "/" + version.Program - runtimeRoot = "/" + version.Program + check.KubeletRoot = "/" + version.Program + check.RuntimeRoot = "/" + version.Program } } } } } - return kubeletRoot, runtimeRoot, hasCFS, hasPIDs + return check } diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go index 611e61a45b1d..9508ac4e66d4 100644 --- a/pkg/daemons/agent/agent.go +++ b/pkg/daemons/agent/agent.go @@ -5,8 +5,10 @@ import ( "os" "time" + "github.com/rancher/k3s/pkg/cgroups" "github.com/rancher/k3s/pkg/daemons/config" - "github.com/rancher/k3s/pkg/daemons/executor" // used for cgroup2 evacuation, not specific to rootless mode + "github.com/rancher/k3s/pkg/daemons/executor" + "github.com/rootless-containers/rootlesskit/pkg/parent/cgrouputil" // used for cgroup2 evacuation, not specific to rootless mode "github.com/sirupsen/logrus" "k8s.io/component-base/logs" _ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration @@ -18,15 +20,6 @@ const ( windowsPrefix = "npipe://" ) -type CgroupCheck struct { - KubeletRoot string - RuntimeRoot string - HasCFS bool - HasPIDs bool - IsV2 bool - V2Evac bool // cgroupv2 needs evacuation of procs from / -} - func Agent(config *config.Agent) error { rand.Seed(time.Now().UTC().UnixNano()) @@ -51,7 +44,16 @@ func startKubeProxy(cfg *config.Agent) error { } func startKubelet(cfg *config.Agent) error { - argsMap := kubeletArgs(cfg) + cgroupsCheck := 
cgroups.CheckCgroups() + if cgroupsCheck.V2Evac { + // evacuate processes from cgroup / to /init + if err := cgrouputil.EvacuateCgroup2("init"); err != nil { + logrus.Errorf("failed to evacuate cgroup2: %+v", err) + return err + } + } + + argsMap := kubeletArgs(cfg, cgroupsCheck) args := config.GetArgsList(argsMap, cfg.ExtraKubeletArgs) logrus.Infof("Running kubelet %s", config.ArgString(args)) diff --git a/pkg/daemons/agent/agent_linux.go b/pkg/daemons/agent/agent_linux.go index 456032bd75e8..a055e58bf10a 100644 --- a/pkg/daemons/agent/agent_linux.go +++ b/pkg/daemons/agent/agent_linux.go @@ -58,7 +58,7 @@ func kubeProxyArgs(cfg *config.Agent) map[string]string { return argsMap } -func kubeletArgs(cfg *config.Agent) map[string]string { +func kubeletArgs(cfg *config.Agent, cgroupsInfo cgroups.CgroupCheck) map[string]string { argsMap := map[string]string{ "healthz-bind-address": "127.0.0.1", "read-only-port": "0", @@ -125,22 +125,21 @@ func kubeletArgs(cfg *config.Agent) map[string]string { if err != nil || defaultIP.String() != cfg.NodeIP { argsMap["node-ip"] = cfg.NodeIP } - kubeletRoot, runtimeRoot, hasCFS, hasPIDs := cgroups.CheckCgroups() - if !hasCFS { + if !cgroupsInfo.HasCFS { logrus.Warn("Disabling CPU quotas due to missing cpu.cfs_period_us") argsMap["cpu-cfs-quota"] = "false" } - if !hasPIDs { + if !cgroupsInfo.HasPIDs { logrus.Warn("Disabling pod PIDs limit feature due to missing cgroup pids support") argsMap["cgroups-per-qos"] = "false" argsMap["enforce-node-allocatable"] = "" argsMap["feature-gates"] = addFeatureGate(argsMap["feature-gates"], "SupportPodPidsLimit=false") } - if kubeletRoot != "" { - argsMap["kubelet-cgroups"] = kubeletRoot + if cgroupsInfo.KubeletRoot != "" { + argsMap["kubelet-cgroups"] = cgroupsInfo.KubeletRoot } - if runtimeRoot != "" { - argsMap["runtime-cgroups"] = runtimeRoot + if cgroupsInfo.RuntimeRoot != "" { + argsMap["runtime-cgroups"] = cgroupsInfo.RuntimeRoot } if system.RunningInUserNS() { argsMap["feature-gates"] = 
addFeatureGate(argsMap["feature-gates"], "DevicePlugins=false") @@ -162,7 +161,7 @@ func kubeletArgs(cfg *config.Agent) map[string]string { } if cfg.Rootless { - createRootlessConfig(argsMap, hasCFS, hasCFS) + createRootlessConfig(argsMap, cgroupsInfo.HasCFS, cgroupsInfo.HasCFS) } if cfg.ProtectKernelDefaults { From 6dbf081f1d60608a3002503401b0c0214d031e22 Mon Sep 17 00:00:00 2001 From: Derek Nola Date: Wed, 15 Sep 2021 12:45:26 -0700 Subject: [PATCH 08/10] Updated PR for merge conflicts. --- pkg/daemons/agent/agent.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go index b63c72892607..3b34d0b03a01 100644 --- a/pkg/daemons/agent/agent.go +++ b/pkg/daemons/agent/agent.go @@ -6,9 +6,9 @@ import ( "os" "time" - "github.com/rancher/k3s/pkg/cgroups" "github.com/rancher/k3s/pkg/agent/config" "github.com/rancher/k3s/pkg/agent/proxy" + "github.com/rancher/k3s/pkg/cgroups" daemonconfig "github.com/rancher/k3s/pkg/daemons/config" "github.com/rancher/k3s/pkg/daemons/executor" "github.com/rootless-containers/rootlesskit/pkg/parent/cgrouputil" // used for cgroup2 evacuation, not specific to rootless mode @@ -60,7 +60,7 @@ func startKubelet(ctx context.Context, cfg *daemonconfig.Agent) error { } } - argsMap := kubeletArgs(cfg) + argsMap := kubeletArgs(cfg, cgroupsCheck) args := daemonconfig.GetArgs(argsMap, cfg.ExtraKubeletArgs) logrus.Infof("Running kubelet %s", daemonconfig.ArgString(args)) From f1c6d1f4864660b42cdd1a32b749d154b75ebffe Mon Sep 17 00:00:00 2001 From: Derek Nola Date: Wed, 15 Sep 2021 13:50:38 -0700 Subject: [PATCH 09/10] Go fmt fix --- pkg/daemons/agent/agent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go index 3b34d0b03a01..4689d0a9b1b0 100644 --- a/pkg/daemons/agent/agent.go +++ b/pkg/daemons/agent/agent.go @@ -51,7 +51,7 @@ func startKubeProxy(ctx context.Context, cfg *daemonconfig.Agent) error { } func 
startKubelet(ctx context.Context, cfg *daemonconfig.Agent) error { - cgroupsCheck := cgroups.CheckCgroups() + cgroupsCheck := cgroups.CheckCgroups() if cgroupsCheck.V2Evac { // evacuate processes from cgroup / to /init if err := cgrouputil.EvacuateCgroup2("init"); err != nil { From 20d97a98cada03e4f1766a6c45f361e86e395da5 Mon Sep 17 00:00:00 2001 From: Derek Nola Date: Thu, 16 Sep 2021 14:12:47 -0700 Subject: [PATCH 10/10] Fix to go fmt again --- pkg/daemons/agent/agent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go index 4689d0a9b1b0..9338ac3f7636 100644 --- a/pkg/daemons/agent/agent.go +++ b/pkg/daemons/agent/agent.go @@ -59,7 +59,7 @@ func startKubelet(ctx context.Context, cfg *daemonconfig.Agent) error { return err } } - + argsMap := kubeletArgs(cfg, cgroupsCheck) args := daemonconfig.GetArgs(argsMap, cfg.ExtraKubeletArgs)