Skip to content

Commit

Permalink
Merge pull request #617 from alexhudspith/cgroup2-swap-534
Browse files Browse the repository at this point in the history
Fix swap handling for cgroups v2
  • Loading branch information
stgraber authored Mar 27, 2024
2 parents 26dbfd2 + a6c309b commit 532c61f
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 69 deletions.
4 changes: 3 additions & 1 deletion src/bindings.c
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,7 @@ static void __attribute__((constructor)) lxcfs_init(void)
{
__do_close int init_ns = -EBADF, root_fd = -EBADF,
pidfd = -EBADF;
__do_free char *cgroup = NULL;
int i = 0;
pid_t pid;
struct hierarchy *hierarchy;
Expand Down Expand Up @@ -920,7 +921,8 @@ static void __attribute__((constructor)) lxcfs_init(void)
lxcfs_info("Kernel supports pidfds");
}

can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
cgroup = get_pid_cgroup(pid, "memory");
can_use_swap = cgroup && cgroup_ops->can_use_swap(cgroup_ops, cgroup);
if (can_use_swap)
lxcfs_info("Kernel supports swap accounting");
else
Expand Down
33 changes: 12 additions & 21 deletions src/cgroups/cgfsng.c
Original file line number Diff line number Diff line change
Expand Up @@ -631,34 +631,25 @@ static int cgfsng_get_memory_slabinfo_fd(struct cgroup_ops *ops, const char *cgr
return openat(h->fd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
}

static bool cgfsng_can_use_swap(struct cgroup_ops *ops)
static bool cgfsng_can_use_swap(struct cgroup_ops *ops, const char *cgroup)
{
bool has_swap = false;
__do_free char *cgroup_rel = NULL, *junk_value = NULL;
const char *file;
struct hierarchy *h;

h = ops->get_hierarchy(ops, "memory");
if (!h)
return false;

if (is_unified_hierarchy(h)) {
if (faccessat(h->fd, "memory.swap.max", F_OK, 0))
return false;

if (faccessat(h->fd, "memory.swap.current", F_OK, 0))
return false;

has_swap = true;
} else {
if (faccessat(h->fd, "memory.memsw.limit_in_bytes", F_OK, 0))
return false;

if (faccessat(h->fd, "memory.memsw.usage_in_bytes", F_OK, 0))
return false;

has_swap = true;
}

return has_swap;
cgroup_rel = must_make_path_relative(cgroup, NULL);
file = is_unified_hierarchy(h) ? "memory.swap.current" : "memory.memsw.usage_in_bytes";
/* For v2, we need to look at the lower levels of the hierarchy because
* no 'memory.swap.current' file exists at the root. We must search
* upwards in the hierarchy in case memory accounting is disabled via
* cgroup.subtree_control for the given cgroup itself.
*/
int ret = cgroup_walkup_to_root(ops->cgroup2_root_fd, h->fd, cgroup_rel, file, &junk_value);
return ret == 0;
}

static int cgfsng_get_memory_stats(struct cgroup_ops *ops, const char *cgroup,
Expand Down
2 changes: 1 addition & 1 deletion src/cgroups/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ struct cgroup_ops {
char **value);
int (*get_memory_slabinfo_fd)(struct cgroup_ops *ops,
const char *cgroup);
bool (*can_use_swap)(struct cgroup_ops *ops);
bool (*can_use_swap)(struct cgroup_ops *ops, const char *cgroup);

/* cpuset */
int (*get_cpuset_cpus)(struct cgroup_ops *ops, const char *cgroup,
Expand Down
123 changes: 77 additions & 46 deletions src/proc_fuse.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,21 +239,37 @@ __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
return 0;
}

static uint64_t get_memlimit(const char *cgroup, bool swap)
/**
* Gets a non-hierarchical memory controller limit, or UINT64_MAX if no limit is
* in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise
* reads the memory (RAM) limits.
*
* @returns 0 on success (and sets `*limit`), < 0 on error
*/
static int get_memlimit(const char *cgroup, bool swap, uint64_t *limit)
{
__do_free char *memlimit_str = NULL;
uint64_t memlimit = 0;
uint64_t memlimit = UINT64_MAX;
int ret;

if (swap)
ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
else
ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0)
lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s",
swap ? ".swap" : "", memlimit_str, cgroup);

return memlimit;
if (ret < 0)
return ret;

if (memlimit_str[0]) {
ret = safe_uint64(memlimit_str, &memlimit, 10);
if (ret < 0) {
lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s",
swap ? ".swap" : "", memlimit_str, cgroup);
return ret;
}
}
*limit = memlimit;
return 0;
}

/*
Expand Down Expand Up @@ -318,31 +334,44 @@ static char *gnu_dirname(char *path)
return path;
}

static uint64_t get_min_memlimit(const char *cgroup, bool swap)
/**
* Gets a hierarchical memory controller limit, or UINT64_MAX if no limit is
* in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise
* reads the memory (RAM) limits.
*
* @returns 0 on success (and sets `*limit`), < 0 on error
*/
static int get_min_memlimit(const char *cgroup, bool swap, uint64_t *limit)
{
__do_free char *copy = NULL;
uint64_t memlimit = 0, retlimit = 0;
uint64_t memlimit = UINT64_MAX, retlimit = UINT64_MAX;
int ret;

copy = strdup(cgroup);
if (!copy)
return log_error_errno(0, ENOMEM, "Failed to allocate memory");

retlimit = get_memlimit(copy, swap);
ret = get_memlimit(copy, swap, &retlimit);
if (ret < 0)
return ret;

/*
* If the cgroup doesn't start with / (probably won't happen), dirname()
* will terminate with "" instead of "/"
*/
while (*copy && strcmp(copy, "/") != 0) {
while (retlimit != 0 && *copy && strcmp(copy, "/") != 0) {
char *it = copy;

it = gnu_dirname(it);
memlimit = get_memlimit(it, swap);
if (memlimit > 0 && memlimit < retlimit)
ret = get_memlimit(it, swap, &memlimit);
if (ret < 0)
return ret;
if (memlimit < retlimit)
retlimit = memlimit;
};
}

return retlimit;
*limit = retlimit;
return 0;
}

static inline bool startswith(const char *line, const char *pref)
Expand All @@ -361,30 +390,30 @@ static void get_swap_info(const char *cgroup, uint64_t memlimit,
*swtotal = *swusage = 0;
*memswpriority = 1;

memswlimit = get_min_memlimit(cgroup, true);
if (memswlimit > 0) {
ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) != 0)
return;

if (liblxcfs_memory_is_cgroupv2()) {
*swtotal = memswlimit / 1024;
*swusage = memswusage / 1024;
} else {
if (memlimit > memswlimit)
*swtotal = 0;
else
*swtotal = (memswlimit - memlimit) / 1024;
if (memusage > memswusage || swtotal == 0)
*swusage = 0;
else
*swusage = (memswusage - memusage) / 1024;
}

ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
if (ret >= 0)
safe_uint64(memswpriority_str, memswpriority, 10);
ret = get_min_memlimit(cgroup, true, &memswlimit);
if (ret < 0)
return;
ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
if (ret < 0 || safe_uint64(memswusage_str, &memswusage, 10) < 0)
return;

if (liblxcfs_memory_is_cgroupv2()) {
*swtotal = memswlimit / 1024;
*swusage = memswusage / 1024;
} else {
if (memlimit > memswlimit)
*swtotal = 0;
else
*swtotal = (memswlimit - memlimit) / 1024;
if (memusage > memswusage || *swtotal == 0)
*swusage = 0;
else
*swusage = (memswusage - memusage) / 1024;
}

ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str);
if (ret >= 0)
safe_uint64(memswpriority_str, memswpriority, 10);
}

static int proc_swaps_read(char *buf, size_t size, off_t offset,
Expand Down Expand Up @@ -432,12 +461,12 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset,
return read_file_fuse("/proc/swaps", buf, size, d);
prune_init_slice(cgroup);

memlimit = get_min_memlimit(cgroup, false);

ret = get_min_memlimit(cgroup, false, &memlimit);
if (ret < 0)
return 0;
ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
if (ret < 0)
return 0;

if (safe_uint64(memusage_str, &memusage, 10) < 0)
lxcfs_error("Failed to convert memusage %s", memusage_str);

Expand All @@ -459,11 +488,13 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset,
}

if (wants_swap) {
/* The total amount of swap is always reported to be the
/* For cgroups v1, the total amount of swap is always reported to be the
lesser of the RAM+SWAP limit or the SWAP device size.
This is because the kernel can swap as much as it
wants and not only up to swtotal. */
swtotal = memlimit / 1024 + swtotal;
if (!liblxcfs_memory_is_cgroupv2())
swtotal = memlimit / 1024 + swtotal;

if (hostswtotal < swtotal) {
swtotal = hostswtotal;
}
Expand Down Expand Up @@ -1318,8 +1349,9 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
if (!cgroup_parse_memory_stat(cgroup, &mstat))
return read_file_fuse("/proc/meminfo", buf, size, d);

memlimit = get_min_memlimit(cgroup, false);

ret = get_min_memlimit(cgroup, false, &memlimit);
if (ret < 0)
return read_file_fuse("/proc/meminfo", buf, size, d);
/*
* Following values are allowed to fail, because swapaccount might be
* turned off for current kernel.
Expand Down Expand Up @@ -1359,11 +1391,10 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,

sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal);

/* The total amount of swap is always reported to be the
/* In cgroups v1, the total amount of swap is always reported to be the
lesser of the RAM+SWAP limit or the SWAP device size.
This is because the kernel can swap as much as it
wants and not only up to swtotal. */

if (!liblxcfs_memory_is_cgroupv2())
swtotal += memlimit;

Expand Down

0 comments on commit 532c61f

Please sign in to comment.