diff --git a/src/bindings.c b/src/bindings.c index 27c08c38..dc0550ca 100644 --- a/src/bindings.c +++ b/src/bindings.c @@ -866,6 +866,7 @@ static void __attribute__((constructor)) lxcfs_init(void) { __do_close int init_ns = -EBADF, root_fd = -EBADF, pidfd = -EBADF; + __do_free char *cgroup = NULL; int i = 0; pid_t pid; struct hierarchy *hierarchy; @@ -920,7 +921,8 @@ static void __attribute__((constructor)) lxcfs_init(void) lxcfs_info("Kernel supports pidfds"); } - can_use_swap = cgroup_ops->can_use_swap(cgroup_ops); + cgroup = get_pid_cgroup(pid, "memory"); + can_use_swap = cgroup && cgroup_ops->can_use_swap(cgroup_ops, cgroup); if (can_use_swap) lxcfs_info("Kernel supports swap accounting"); else diff --git a/src/cgroups/cgfsng.c b/src/cgroups/cgfsng.c index 2d583c67..7b732926 100644 --- a/src/cgroups/cgfsng.c +++ b/src/cgroups/cgfsng.c @@ -631,34 +631,25 @@ static int cgfsng_get_memory_slabinfo_fd(struct cgroup_ops *ops, const char *cgr return openat(h->fd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); } -static bool cgfsng_can_use_swap(struct cgroup_ops *ops) +static bool cgfsng_can_use_swap(struct cgroup_ops *ops, const char *cgroup) { - bool has_swap = false; + __do_free char *cgroup_rel = NULL, *junk_value = NULL; + const char *file; struct hierarchy *h; h = ops->get_hierarchy(ops, "memory"); if (!h) return false; - if (is_unified_hierarchy(h)) { - if (faccessat(h->fd, "memory.swap.max", F_OK, 0)) - return false; - - if (faccessat(h->fd, "memory.swap.current", F_OK, 0)) - return false; - - has_swap = true; - } else { - if (faccessat(h->fd, "memory.memsw.limit_in_bytes", F_OK, 0)) - return false; - - if (faccessat(h->fd, "memory.memsw.usage_in_bytes", F_OK, 0)) - return false; - - has_swap = true; - } - - return has_swap; + cgroup_rel = must_make_path_relative(cgroup, NULL); + file = is_unified_hierarchy(h) ? "memory.swap.current" : "memory.memsw.usage_in_bytes"; + /* For v2, we need to look at the lower levels of the hierarchy because + * no 'memory.swap.current' file exists at the root. We must search + * upwards in the hierarchy in case memory accounting is disabled via + * cgroup.subtree_control for the given cgroup itself. + */ + int ret = cgroup_walkup_to_root(ops->cgroup2_root_fd, h->fd, cgroup_rel, file, &junk_value); + return ret == 0; } static int cgfsng_get_memory_stats(struct cgroup_ops *ops, const char *cgroup, diff --git a/src/cgroups/cgroup.h b/src/cgroups/cgroup.h index 122e8ebf..afa7db2e 100644 --- a/src/cgroups/cgroup.h +++ b/src/cgroups/cgroup.h @@ -148,7 +148,7 @@ struct cgroup_ops { char **value); int (*get_memory_slabinfo_fd)(struct cgroup_ops *ops, const char *cgroup); - bool (*can_use_swap)(struct cgroup_ops *ops); + bool (*can_use_swap)(struct cgroup_ops *ops, const char *cgroup); /* cpuset */ int (*get_cpuset_cpus)(struct cgroup_ops *ops, const char *cgroup, diff --git a/src/proc_fuse.c b/src/proc_fuse.c index 1d253bba..1049e72d 100644 --- a/src/proc_fuse.c +++ b/src/proc_fuse.c @@ -239,21 +239,37 @@ __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi) return 0; } -static uint64_t get_memlimit(const char *cgroup, bool swap) +/** + * Gets a non-hierarchical memory controller limit, or UINT64_MAX if no limit is + * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise + * reads the memory (RAM) limits. + * + * @returns 0 on success (and sets `*limit`), < 0 on error + */ +static int get_memlimit(const char *cgroup, bool swap, uint64_t *limit) { __do_free char *memlimit_str = NULL; - uint64_t memlimit = 0; + uint64_t memlimit = UINT64_MAX; int ret; if (swap) ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str); else ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str); - if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0) - lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s", - swap ? ".swap" : "", memlimit_str, cgroup); - return memlimit; + if (ret < 0) + return ret; + + if (memlimit_str[0]) { + ret = safe_uint64(memlimit_str, &memlimit, 10); + if (ret < 0) { + lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s", + swap ? ".swap" : "", memlimit_str, cgroup); + return ret; + } + } + *limit = memlimit; + return 0; } /* @@ -318,31 +334,44 @@ static char *gnu_dirname(char *path) return path; } -static uint64_t get_min_memlimit(const char *cgroup, bool swap) +/** + * Gets a hierarchical memory controller limit, or UINT64_MAX if no limit is + * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise + * reads the memory (RAM) limits. + * + * @returns 0 on success (and sets `*limit`), < 0 on error + */ +static int get_min_memlimit(const char *cgroup, bool swap, uint64_t *limit) { __do_free char *copy = NULL; - uint64_t memlimit = 0, retlimit = 0; + uint64_t memlimit = UINT64_MAX, retlimit = UINT64_MAX; + int ret; copy = strdup(cgroup); if (!copy) return log_error_errno(0, ENOMEM, "Failed to allocate memory"); - retlimit = get_memlimit(copy, swap); + ret = get_memlimit(copy, swap, &retlimit); + if (ret < 0) + return ret; /* * If the cgroup doesn't start with / (probably won't happen), dirname() * will terminate with "" instead of "/" */ - while (*copy && strcmp(copy, "/") != 0) { + while (retlimit != 0 && *copy && strcmp(copy, "/") != 0) { char *it = copy; it = gnu_dirname(it); - memlimit = get_memlimit(it, swap); - if (memlimit > 0 && memlimit < retlimit) + ret = get_memlimit(it, swap, &memlimit); + if (ret < 0) + return ret; + if (memlimit < retlimit) retlimit = memlimit; - }; + } - return retlimit; + *limit = retlimit; + return 0; } static inline bool startswith(const char *line, const char *pref) @@ -361,30 +390,30 @@ static void get_swap_info(const char *cgroup, uint64_t memlimit, *swtotal = *swusage = 0; *memswpriority = 1; - memswlimit = get_min_memlimit(cgroup, true); - if (memswlimit > 0) { - ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str); - if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) != 0) - return; - - if (liblxcfs_memory_is_cgroupv2()) { - *swtotal = memswlimit / 1024; - *swusage = memswusage / 1024; - } else { - if (memlimit > memswlimit) - *swtotal = 0; - else - *swtotal = (memswlimit - memlimit) / 1024; - if (memusage > memswusage || swtotal == 0) - *swusage = 0; - else - *swusage = (memswusage - memusage) / 1024; - } - - ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str); - if (ret >= 0) - safe_uint64(memswpriority_str, memswpriority, 10); + ret = get_min_memlimit(cgroup, true, &memswlimit); + if (ret < 0) + return; + ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str); + if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) < 0) + return; + + if (liblxcfs_memory_is_cgroupv2()) { + *swtotal = memswlimit / 1024; + *swusage = memswusage / 1024; + } else { + if (memlimit > memswlimit) + *swtotal = 0; + else + *swtotal = (memswlimit - memlimit) / 1024; + if (memusage > memswusage || *swtotal == 0) + *swusage = 0; + else + *swusage = (memswusage - memusage) / 1024; } + + ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str); + if (ret >= 0) + safe_uint64(memswpriority_str, memswpriority, 10); } static int proc_swaps_read(char *buf, size_t size, off_t offset, @@ -432,12 +461,12 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, return read_file_fuse("/proc/swaps", buf, size, d); prune_init_slice(cgroup); - memlimit = get_min_memlimit(cgroup, false); - + ret = get_min_memlimit(cgroup, false, &memlimit); + if (ret < 0) + return 0; ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str); if (ret < 0) return 0; - if (safe_uint64(memusage_str, &memusage, 10) < 0) lxcfs_error("Failed to convert memusage %s", memusage_str); @@ -459,11 +488,13 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, } if (wants_swap) { - /* The total amount of swap is always reported to be the + /* For cgroups v1, the total amount of swap is always reported to be the lesser of the RAM+SWAP limit or the SWAP device size. This is because the kernel can swap as much as it wants and not only up to swtotal. */ - swtotal = memlimit / 1024 + swtotal; + if (!liblxcfs_memory_is_cgroupv2()) + swtotal = memlimit / 1024 + swtotal; + if (hostswtotal < swtotal) { swtotal = hostswtotal; } @@ -1318,8 +1349,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, if (!cgroup_parse_memory_stat(cgroup, &mstat)) return read_file_fuse("/proc/meminfo", buf, size, d); - memlimit = get_min_memlimit(cgroup, false); - + ret = get_min_memlimit(cgroup, false, &memlimit); + if (ret < 0) + return read_file_fuse("/proc/meminfo", buf, size, d); /* * Following values are allowed to fail, because swapaccount might be * turned off for current kernel. @@ -1359,11 +1391,10 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal); - /* The total amount of swap is always reported to be the + /* In cgroups v1, the total amount of swap is always reported to be the lesser of the RAM+SWAP limit or the SWAP device size. This is because the kernel can swap as much as it wants and not only up to swtotal. */ - if (!liblxcfs_memory_is_cgroupv2()) swtotal += memlimit;