/*
* Measure cgroup rstat (recursive stats) flushing overhead and latency.
*
* Loosely based on bpftrace script cgroup_rstat_tracepoint.bt
* - https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
*
* Depends on tracepoints added in kernel v6.10
*/
#include <vmlinux.h>
#include <bpf/bpf_tracing.h>
#include "maps.bpf.h"
#include <linux/errno.h>
#define MAX_CGROUP_LEVELS 5
/* From: linux/include/linux/cgroup.h */
static inline u64 cgroup_id(const struct cgroup *cgrp)
{
return cgrp->kn->id;
}
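/* Flush invocations per cgroup level (level 0 is the root cgroup).
* Levels deeper than MAX_CGROUP_LEVELS are clamped into the last slot,
* which is why max_entries is MAX_CGROUP_LEVELS + 1.
*/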
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, MAX_CGROUP_LEVELS + 1);
__type(key, u32);
__type(value, u64);
} cgroup_rstat_flush_total SEC(".maps");
// 24 buckets for latency; the last finite bucket covers the range 0.83s .. 1.67s
#define MAX_LATENCY_SLOT 24
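/* Bucket arithmetic behind the range above (a sanity check, not new logic):
* the programs below record deltas in units of 0.1 usec (ns / 100), and the
* exp2 histogram buckets are powers of two of that unit, so the last finite
* bucket spans 2^23 .. 2^24 units, i.e. roughly 0.83s .. 1.67s. The "+ 2" in
* the histogram map sizing leaves room for the overflow bucket plus the extra
* slot that the maps.bpf.h helper uses to accumulate the _sum (an assumption
* about increment_exp2_histogram_nosync(), not something defined in this file).
*/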
struct hist_key_t {
u32 bucket;
};
struct hist_lvl_key_t {
u16 level;
u16 bucket;
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, MAX_LATENCY_SLOT + 2);
__type(key, struct hist_key_t);
__type(value, u64);
} cgroup_rstat_lock_wait_seconds SEC(".maps");
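/* Timestamp when the current task first hit lock contention, keyed by pid_tgid.
* Recorded in the lock_contended tracepoint and consumed in the locked tracepoint.
*/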
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 1024 * 128);
__type(key, u64);
__type(value, u64);
} start_wait SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, MAX_LATENCY_SLOT + 2);
__type(key, struct hist_key_t);
__type(value, u64);
} cgroup_rstat_lock_hold_seconds SEC(".maps");
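/* Timestamp when the current task obtained the rstat lock, keyed by pid_tgid.
* Recorded in the locked tracepoint and consumed in the unlock tracepoint.
*/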
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 1024 * 128);
__type(key, u64);
__type(value, u64);
} start_hold SEC(".maps");
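/* exp2 histogram of flush time, split per cgroup level via hist_lvl_key_t */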
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(max_entries, (MAX_LATENCY_SLOT + 2) * MAX_CGROUP_LEVELS);
__type(key, struct hist_lvl_key_t);
__type(value, u64);
} cgroup_rstat_flush_latency_seconds SEC(".maps");
struct start_time_key_t {
u64 pid_tgid;
u64 cgrp_id;
};
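/* Flush start timestamps, keyed by (pid_tgid, cgroup id) so that the fexit
* probe can match the fentry that started the flush of the same cgroup.
*/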
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 1024 * 128);
__type(key, struct start_time_key_t);
__type(value, u64);
} start_flush SEC(".maps");
struct flush_key_t {
u64 cgrp_id;
u32 level;
} __attribute__((packed));
/* This provides an easy way to enable per-cgroup flush recording.
* By default this is disabled, as it generates too much data.
* For troubleshooting purposes it can be enabled on selected servers.
*/
#undef CONFIG_TRACK_PER_CGROUP_FLUSH
#ifdef CONFIG_TRACK_PER_CGROUP_FLUSH
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(max_entries, 10000);
__type(key, struct flush_key_t);
__type(value, u64);
} cgroup_rstat_flush_nanoseconds_sum SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(max_entries, 10000);
__type(key, struct flush_key_t);
__type(value, u64);
} cgroup_rstat_flush_nanoseconds_count SEC(".maps");
#endif /* CONFIG_TRACK_PER_CGROUP_FLUSH */
/* Complex key for encoding lock properties */
struct lock_key_t {
u8 contended;
u8 yield;
u16 level;
};
#define MAX_LOCK_KEY_ENTRIES 128
#define MAX_ERROR_TYPES 6
enum error_types {
ERR_UNKNOWN = 0,
ERR_NOMEM = 1,
ERR_BUSY = 2,
ERR_EXIST = 3, /* elem already exists */
ERR_NO_ELEM = 4,
ERR_TS_RANGE = 5, /* timestamp zero (out of range) */
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, MAX_ERROR_TYPES);
__type(key, u32);
__type(value, u64);
} cgroup_rstat_map_errors_total SEC(".maps");
/* This provides an easy way to disable recording of map errors.
* Disabling it reduces the BPF code size.
*/
#define CONFIG_TRACK_MAP_ERRORS 1
// #undef CONFIG_TRACK_MAP_ERRORS
void record_map_errors(int err)
{
#ifdef CONFIG_TRACK_MAP_ERRORS
u32 key = ERR_UNKNOWN;
switch (err) {
case -ENOMEM:
key = ERR_NOMEM;
break;
case -EBUSY:
key = ERR_BUSY;
break;
case -EEXIST:
key = ERR_EXIST;
break;
case -ENODATA:
key = ERR_NO_ELEM;
break;
case -ERANGE:
key = ERR_TS_RANGE;
break;
}
increment_map_nosync(&cgroup_rstat_map_errors_total, &key, 1);
#endif /* CONFIG_TRACK_MAP_ERRORS */
}
/* Record timestamp in LRU map */
static __always_inline int record_timestamp_map_key(u64 ts, void *map, void *key)
{
u64 *ts_val;
int err = 0;
ts_val = bpf_map_lookup_elem(map, key);
if (!ts_val) {
err = bpf_map_update_elem(map, key, &ts, BPF_NOEXIST);
if (err)
record_map_errors(err);
} else {
*ts_val = ts;
}
return err;
}
static __always_inline int record_timestamp(u64 ts, void *map)
{
u64 key = bpf_get_current_pid_tgid();
return record_timestamp_map_key(ts, map, &key);
}
/* Get timestamp from LRU map */
static __always_inline u64 get_timestamp_map_key(void *map, void *key)
{
u64 *ts_val;
u64 ts;
ts_val = bpf_map_lookup_elem(map, key);
if (!ts_val) {
/* Can happen due to a reload on a live system, where tracepoints using
* get_timestamp() get activated before those creating timestamps via
* record_timestamp() */
record_map_errors(-ENODATA);
return 0;
}
ts = *ts_val;
*ts_val = 0; /* reset timestamp */
if (ts == 0)
record_map_errors(-ERANGE);
return ts;
}
static __always_inline u64 get_timestamp(void *map)
{
u64 key = bpf_get_current_pid_tgid();
return get_timestamp_map_key(map, &key);
}
/* Total counter for obtaining the lock, together with state (Prometheus labels)
*
* State covers the cgroup level, plus the contended and yield cases.
*
* The problematic/interesting case is when the lock was contended (prior to
* obtaining the lock). This "contended" label is key to analyzing locking
* issues.
*
* The kernel can yield the rstat lock while walking individual CPU stats.
* This leads to "interesting" concurrency issues, so having a "yield" label
* helps with diagnosis.
*/
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(max_entries, MAX_LOCK_KEY_ENTRIES);
__type(key, struct lock_key_t);
__type(value, u64);
} cgroup_rstat_locked_total SEC(".maps");
/** Measurement#1: lock rates
* =========================
For locks, the problematic/interesting case is when the lock was contended.
Simply counting lock+unlock is complicated by the lock being yielded in the main
flushing loop (in cgroup_rstat_flush_locked()). The tracepoint's "cpu" argument
will be -1 when the lock is not a yielded lock.
Concern: The lock rates will vary (a lot), and aggregating this as an average
will not capture spikes, especially given that Prometheus captures only happen
every 53 seconds.
*/
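/* Example Prometheus query (a sketch; the metric name follows the map name with
* an ebpf_exporter_ prefix, and the labels assume the exporter config decodes
* the lock_key_t fields as "contended", "yield" and "level"):
*   rate(ebpf_exporter_cgroup_rstat_locked_total{contended="1"}[1m])
* i.e. the per-second rate of lock acquisitions that had to wait first.
*/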
SEC("tp_btf/cgroup_rstat_locked")
int BPF_PROG(rstat_locked, struct cgroup *cgrp, int cpu, bool contended)
{
u64 now = bpf_ktime_get_ns();
struct lock_key_t lock_key = { 0 };
u32 level = cgrp->level;
if (level > MAX_CGROUP_LEVELS)
level = MAX_CGROUP_LEVELS;
if (cpu >= 0)
lock_key.yield = 1;
lock_key.contended = contended;
lock_key.level = (level & 0xFF);
increment_map_nosync(&cgroup_rstat_locked_total, &lock_key, 1);
/* Lock hold time start */
record_timestamp(now, &start_hold);
/* Lock contended event happened prior to obtaining this lock.
* Get back start "wait" timestamp that was recorded.
*/
if (contended) {
u64 start_wait_ts;
struct hist_key_t key;
u64 delta;
/* Lock wait time */
start_wait_ts = get_timestamp(&start_wait);
if (start_wait_ts) {
delta = (now - start_wait_ts) / 100; /* 0.1 usec */
increment_exp2_histogram_nosync(&cgroup_rstat_lock_wait_seconds, key, delta, MAX_LATENCY_SLOT);
}
}
return 0;
}
SEC("tp_btf/cgroup_rstat_unlock")
int BPF_PROG(rstat_unlock, struct cgroup *cgrp, int cpu, bool contended)
{
u64 now = bpf_ktime_get_ns();
struct hist_key_t key;
u64 start_hold_ts;
u64 delta;
/* Lock hold time */
start_hold_ts = get_timestamp(&start_hold);
if (start_hold_ts) {
delta = (now - start_hold_ts) / 100; /* 0.1 usec */
increment_exp2_histogram_nosync(&cgroup_rstat_lock_hold_seconds, key, delta, MAX_LATENCY_SLOT);
}
return 0;
}
/** Measurement#2: latency/delay caused by flush
* ============================================
Measure both the time waiting for the lock and the time spent holding the lock.
This should be a histogram (for a later latency heatmap).
*/
SEC("tp_btf/cgroup_rstat_lock_contended")
int BPF_PROG(rstat_lock_contended, struct cgroup *cgrp, int cpu, bool contended)
{
u64 now = bpf_ktime_get_ns();
record_timestamp(now, &start_wait);
return 0;
}
/** Measurement#3: flush rate
* =========================
Simply count invocations of cgroup_rstat_flush_locked().
Concern: The flush rates will vary (a lot), e.g. when cadvisor collects stats
for all cgroups in the system, or when kswapd does concurrent flushing (of the
root cgroup). Averaging this (over approx 1 minute) gives the wrong impression.
Mitigation workaround: Store counters per cgroup "level" (level=0 is root).
This allows us to separate root-cgroup flushes from cadvisor walking all
cgroup levels.
*/
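/* Example Prometheus query (a sketch; assumes the exporter config exposes the
* counter key as a "level" label):
*   rate(ebpf_exporter_cgroup_rstat_flush_total{level="0"}[1m])
* i.e. the per-second rate of root-cgroup flushes, separate from cadvisor
* walking the deeper levels.
*/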
SEC("fentry/cgroup_rstat_flush_locked")
int BPF_PROG(cgroup_rstat_flush_locked, struct cgroup *cgrp)
{
u64 now = bpf_ktime_get_ns();
u32 level_key = cgrp->level;
if (level_key > MAX_CGROUP_LEVELS)
level_key = MAX_CGROUP_LEVELS;
increment_map_nosync(&cgroup_rstat_flush_total, &level_key, 1);
/* Flush time latency start */
struct start_time_key_t key_ts;
key_ts.pid_tgid = bpf_get_current_pid_tgid();
key_ts.cgrp_id = cgroup_id(cgrp);
record_timestamp_map_key(now, &start_flush, &key_ts);
return 0;
}
SEC("fexit/cgroup_rstat_flush_locked")
int BPF_PROG(cgroup_rstat_flush_locked_exit, struct cgroup *cgrp)
{
u64 now = bpf_ktime_get_ns();
struct start_time_key_t key_ts;
u64 start_flush_ts;
u64 delta, delta_ns;
key_ts.pid_tgid = bpf_get_current_pid_tgid();
key_ts.cgrp_id = cgroup_id(cgrp);
start_flush_ts = get_timestamp_map_key(&start_flush, &key_ts);
if (start_flush_ts == 0)
return 0;
/* Flush time latency */
delta_ns = now - start_flush_ts;
delta = delta_ns / 100; /* 0.1 usec */
u32 level = cgrp->level;
/* Clamp the level like the fentry counter does, so both report consistent levels */
if (level > MAX_CGROUP_LEVELS)
level = MAX_CGROUP_LEVELS;
struct hist_lvl_key_t key = { .level = (u16) level };
increment_exp2_histogram_nosync(&cgroup_rstat_flush_latency_seconds, key, delta, MAX_LATENCY_SLOT);
/*
* ebpf_exporter will also have:
* ebpf_exporter_cgroup_rstat_flush_latency_seconds_sum and
* ebpf_exporter_cgroup_rstat_flush_latency_seconds_count
*
* The Prometheus idea behind having _seconds_count and _seconds_sum
* =================================================================
* The _seconds_count is the number of observed flush calls, so
* rate(_seconds_count[1m]) in a query returns the per-second rate of flushes.
*
* The _seconds_sum is the sum of the delta values, so rate(_seconds_sum[1m])
* is the amount of time spent flushing per second.
*
* Divide these two expressions to get the average latency over the last minute.
* The full expression for average latency would be:
* rate(_seconds_sum[1m]) / rate(_seconds_count[1m])
*/
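/* Spelled out with the metric names above, the average flush latency over the
* last minute would be (a sketch of the query, not something emitted here):
*   rate(ebpf_exporter_cgroup_rstat_flush_latency_seconds_sum[1m])
*     / rate(ebpf_exporter_cgroup_rstat_flush_latency_seconds_count[1m])
*/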
#ifdef CONFIG_TRACK_PER_CGROUP_FLUSH
/* Per cgroup record average latency via nanoseconds_count and nanoseconds_sum */
struct flush_key_t flush_key;
flush_key.cgrp_id = cgroup_id(cgrp);
flush_key.level = cgrp->level;
increment_map_nosync(&cgroup_rstat_flush_nanoseconds_sum, &flush_key, delta_ns);
increment_map_nosync(&cgroup_rstat_flush_nanoseconds_count, &flush_key, 1);
#endif
return 0;
}
char LICENSE[] SEC("license") = "GPL";