1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2019 Facebook
8 #include <linux/ptrace.h>
9 #include <linux/sched.h>
10 #include <linux/types.h>
11 #include <bpf/bpf_helpers.h>
13 typedef uint32_t pid_t
;
14 struct task_struct
{};
16 #define TASK_COMM_LEN 16
17 #define PERF_MAX_STACK_DEPTH 127
19 #define STROBE_TYPE_INVALID 0
20 #define STROBE_TYPE_INT 1
21 #define STROBE_TYPE_STR 2
22 #define STROBE_TYPE_MAP 3
24 #define STACK_TABLE_EPOCH_SHIFT 20
25 #define STROBE_MAX_STR_LEN 1
26 #define STROBE_MAX_CFGS 32
27 #define STROBE_MAX_PAYLOAD \
28 (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \
29 STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
31 struct strobe_value_header
{
33 * meaning depends on type:
34 * 1. int: 0, if value not set, 1 otherwise
35 * 2. str: 1 always, whether value is set or not is determined by ptr
36 * 3. map: 1 always, pointer points to additional struct with number
37 * of entries (up to STROBE_MAX_MAP_ENTRIES)
41 * _reserved might be used for some future fields/flags, but we always
42 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
43 * bytes in one go and get both header and value
49 * strobe_value_generic is used from BPF probe only, but needs to be a union
50 * of strobe_value_int/strobe_value_str/strobe_value_map
52 struct strobe_value_generic
{
53 struct strobe_value_header header
;
60 struct strobe_value_int
{
61 struct strobe_value_header header
;
65 struct strobe_value_str
{
66 struct strobe_value_header header
;
70 struct strobe_value_map
{
71 struct strobe_value_header header
;
72 const struct strobe_map_raw
* value
;
75 struct strobe_map_entry
{
81 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
82 * corresponding int64 ID, which application can use (or ignore) in whatever
83 * way appropriate. Map is "write-only", there is no way to get data out of
84 * map. Map is intended to be used to provide metadata for profilers and is
85 * not to be used for internal in-app communication. All methods are
88 struct strobe_map_raw
{
90 * general purpose unique ID that's up to application to decide
91 * whether and how to use; for request metadata use case id is unique
92 * request ID that's used to match metadata with stack traces on
93 * Strobelight backend side
96 /* number of used entries in map */
99 * having volatile doesn't change anything on BPF side, but clang
100 * emits warnings for passing `volatile const char *` into
101 * bpf_probe_read_user_str that expects just `const char *`
105 * key/value entries, each consisting of 2 pointers to key and value
108 struct strobe_map_entry entries
[STROBE_MAX_MAP_ENTRIES
];
111 /* Following values define supported values of TLS mode */
112 #define TLS_NOT_SET -1
113 #define TLS_LOCAL_EXEC 0
114 #define TLS_IMM_EXEC 1
115 #define TLS_GENERAL_DYN 2
118 * structure that universally represents TLS location (both for static
119 * executables and shared libraries)
121 struct strobe_value_loc
{
123 * tls_mode defines what TLS mode was used for particular metavariable:
124 * - -1 (TLS_NOT_SET) - no metavariable;
125 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
126 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
127 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
128 * Local Dynamic mode is not yet supported, because never seen in
129 * practice. Mode defines how offset field is interpreted. See
130 * calc_location() below for details.
134 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
135 * tpidr_el0 for aarch64).
136 * TLS_IMM_EXEC: absolute address of GOT entry containing offset
137 * from thread pointer;
138 * TLS_GENERAL_DYN: absolute address of double GOT entry
139 * containing tls_index_t struct;
144 struct strobemeta_cfg
{
145 int64_t req_meta_idx
;
146 struct strobe_value_loc int_locs
[STROBE_MAX_INTS
];
147 struct strobe_value_loc str_locs
[STROBE_MAX_STRS
];
148 struct strobe_value_loc map_locs
[STROBE_MAX_MAPS
];
151 struct strobe_map_descr
{
155 * cnt <0 - map value isn't set;
156 * 0 - map has id set, but no key/value entries
160 * both key_lens[i] and val_lens[i] should be >0 for present key/value
163 uint16_t key_lens
[STROBE_MAX_MAP_ENTRIES
];
164 uint16_t val_lens
[STROBE_MAX_MAP_ENTRIES
];
167 struct strobemeta_payload
{
168 /* req_id has valid request ID, if req_meta_valid == 1 */
170 uint8_t req_meta_valid
;
172 * mask has Nth bit set to 1, if Nth metavar was present and
175 uint64_t int_vals_set_mask
;
176 int64_t int_vals
[STROBE_MAX_INTS
];
177 /* len is >0 for present values */
178 uint16_t str_lens
[STROBE_MAX_STRS
];
179 /* if map_descrs[i].cnt == -1, metavar is not present/set */
180 struct strobe_map_descr map_descrs
[STROBE_MAX_MAPS
];
182 * payload has compactly packed values of str and map variables in the
183 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
184 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
187 char payload
[STROBE_MAX_PAYLOAD
];
190 struct strobelight_bpf_sample
{
192 char comm
[TASK_COMM_LEN
];
197 struct strobemeta_payload metadata
;
199 * makes it possible to pass (<real payload size> + 1) as data size to
200 * perf_submit() to avoid perf_submit's paranoia about passing zero as
201 * size, as it deduces that <real payload size> might be
202 * **theoretically** zero
204 char dummy_safeguard
;
208 __uint(type
, BPF_MAP_TYPE_PERF_EVENT_ARRAY
);
209 __uint(max_entries
, 32);
210 __uint(key_size
, sizeof(int));
211 __uint(value_size
, sizeof(int));
212 } samples
SEC(".maps");
215 __uint(type
, BPF_MAP_TYPE_STACK_TRACE
);
216 __uint(max_entries
, 16);
217 __uint(key_size
, sizeof(uint32_t));
218 __uint(value_size
, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH
);
219 } stacks_0
SEC(".maps");
222 __uint(type
, BPF_MAP_TYPE_STACK_TRACE
);
223 __uint(max_entries
, 16);
224 __uint(key_size
, sizeof(uint32_t));
225 __uint(value_size
, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH
);
226 } stacks_1
SEC(".maps");
229 __uint(type
, BPF_MAP_TYPE_PERCPU_ARRAY
);
230 __uint(max_entries
, 1);
231 __type(key
, uint32_t);
232 __type(value
, struct strobelight_bpf_sample
);
233 } sample_heap
SEC(".maps");
236 __uint(type
, BPF_MAP_TYPE_PERCPU_ARRAY
);
237 __uint(max_entries
, STROBE_MAX_CFGS
);
239 __type(value
, struct strobemeta_cfg
);
240 } strobemeta_cfgs
SEC(".maps");
242 /* Type for the dtv. */
243 /* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */
252 /* Partial definition for tcbhead_t */
253 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
260 * TLS module/offset information for shared library case.
261 * For x86-64, this is mapped onto two entries in GOT.
262 * For aarch64, this is pointed to by second GOT entry.
274 static void *calc_location(struct strobe_value_loc
*loc
, void *tls_base
)
278 * - -1 (TLS_NOT_SET), if no metavar is present;
279 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
280 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
281 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
282 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
283 * This scheme makes it possible to use something like:
284 * (tls_mode + 1) * (tls_base + offset)
285 * to get NULL for "no metavar" location, or correct pointer for local
286 * executable mode without doing extra ifs.
288 if (loc
->tls_mode
<= TLS_LOCAL_EXEC
) {
289 /* static executable is simple, we just have offset from
291 void *addr
= tls_base
+ loc
->offset
;
292 /* multiply by (tls_mode + 1) to get NULL, if we have no
293 * metavar in this slot */
294 return (void *)((loc
->tls_mode
+ 1) * (int64_t)addr
);
297 * Other modes are more complicated, we need to jump through a few hoops.
299 * For immediate executable mode (currently supported only for aarch64):
300 * - loc->offset is pointing to a GOT entry containing fixed offset
301 * relative to tls_base;
303 * For general dynamic mode:
304 * - loc->offset is pointing to a beginning of double GOT entries;
305 * - (for aarch64 only) second entry points to tls_index_t struct;
306 * - (for x86-64 only) two GOT entries are already tls_index_t;
307 * - tls_index_t->module is used to find start of TLS section in
308 * which variable resides;
309 * - tls_index_t->offset provides offset within that TLS section,
310 * pointing to value of variable.
312 struct tls_index tls_index
;
316 bpf_probe_read_user(&tls_index
, sizeof(struct tls_index
),
317 (void *)loc
->offset
);
318 /* valid module index is always positive */
319 if (tls_index
.module
> 0) {
320 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
321 bpf_probe_read_user(&dtv
, sizeof(dtv
),
322 &((struct tcbhead
*)tls_base
)->dtv
);
323 dtv
+= tls_index
.module
;
327 bpf_probe_read_user(&tls_ptr
, sizeof(void *), dtv
);
328 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
329 return tls_ptr
&& tls_ptr
!= (void *)-1
330 ? tls_ptr
+ tls_index
.offset
339 static void read_int_var(struct strobemeta_cfg
*cfg
,
340 size_t idx
, void *tls_base
,
341 struct strobe_value_generic
*value
,
342 struct strobemeta_payload
*data
)
344 void *location
= calc_location(&cfg
->int_locs
[idx
], tls_base
);
348 bpf_probe_read_user(value
, sizeof(struct strobe_value_generic
), location
);
349 data
->int_vals
[idx
] = value
->val
;
350 if (value
->header
.len
)
351 data
->int_vals_set_mask
|= (1 << idx
);
354 static __always_inline
uint64_t read_str_var(struct strobemeta_cfg
*cfg
,
355 size_t idx
, void *tls_base
,
356 struct strobe_value_generic
*value
,
357 struct strobemeta_payload
*data
,
363 data
->str_lens
[idx
] = 0;
364 location
= calc_location(&cfg
->str_locs
[idx
], tls_base
);
368 bpf_probe_read_user(value
, sizeof(struct strobe_value_generic
), location
);
369 len
= bpf_probe_read_user_str(payload
, STROBE_MAX_STR_LEN
, value
->ptr
);
371 * if bpf_probe_read_user_str returns error (<0), due to casting to
372 * unsigned int, it will become a big number, so next check is
373 * sufficient to check for errors AND prove to BPF verifier, that
374 * bpf_probe_read_user_str won't return anything bigger than
377 if (len
> STROBE_MAX_STR_LEN
)
380 data
->str_lens
[idx
] = len
;
384 static __always_inline
void *read_map_var(struct strobemeta_cfg
*cfg
,
385 size_t idx
, void *tls_base
,
386 struct strobe_value_generic
*value
,
387 struct strobemeta_payload
*data
,
390 struct strobe_map_descr
* descr
= &data
->map_descrs
[idx
];
391 struct strobe_map_raw map
;
395 descr
->tag_len
= 0; /* presume no tag is set */
396 descr
->cnt
= -1; /* presume no value is set */
398 location
= calc_location(&cfg
->map_locs
[idx
], tls_base
);
402 bpf_probe_read_user(value
, sizeof(struct strobe_value_generic
), location
);
403 if (bpf_probe_read_user(&map
, sizeof(struct strobe_map_raw
), value
->ptr
))
407 descr
->cnt
= map
.cnt
;
408 if (cfg
->req_meta_idx
== idx
) {
409 data
->req_id
= map
.id
;
410 data
->req_meta_valid
= 1;
413 len
= bpf_probe_read_user_str(payload
, STROBE_MAX_STR_LEN
, map
.tag
);
414 if (len
<= STROBE_MAX_STR_LEN
) {
415 descr
->tag_len
= len
;
420 #pragma clang loop unroll(disable)
424 for (int i
= 0; i
< STROBE_MAX_MAP_ENTRIES
; ++i
) {
428 descr
->key_lens
[i
] = 0;
429 len
= bpf_probe_read_user_str(payload
, STROBE_MAX_STR_LEN
,
431 if (len
<= STROBE_MAX_STR_LEN
) {
432 descr
->key_lens
[i
] = len
;
435 descr
->val_lens
[i
] = 0;
436 len
= bpf_probe_read_user_str(payload
, STROBE_MAX_STR_LEN
,
438 if (len
<= STROBE_MAX_STR_LEN
) {
439 descr
->val_lens
[i
] = len
;
454 struct read_var_ctx
{
455 struct strobemeta_payload
*data
;
457 struct strobemeta_cfg
*cfg
;
459 /* value gets mutated */
460 struct strobe_value_generic
*value
;
464 static int read_var_callback(__u32 index
, struct read_var_ctx
*ctx
)
468 if (index
>= STROBE_MAX_INTS
)
470 read_int_var(ctx
->cfg
, index
, ctx
->tls_base
, ctx
->value
, ctx
->data
);
473 if (index
>= STROBE_MAX_MAPS
)
475 ctx
->payload
= read_map_var(ctx
->cfg
, index
, ctx
->tls_base
,
476 ctx
->value
, ctx
->data
, ctx
->payload
);
479 if (index
>= STROBE_MAX_STRS
)
481 ctx
->payload
+= read_str_var(ctx
->cfg
, index
, ctx
->tls_base
,
482 ctx
->value
, ctx
->data
, ctx
->payload
);
487 #endif /* USE_BPF_LOOP */
490 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
491 * pointer to *right after* payload ends
498 static void *read_strobe_meta(struct task_struct
*task
,
499 struct strobemeta_payload
*data
)
501 pid_t pid
= bpf_get_current_pid_tgid() >> 32;
502 struct strobe_value_generic value
= {0};
503 struct strobemeta_cfg
*cfg
;
504 void *tls_base
, *payload
;
506 cfg
= bpf_map_lookup_elem(&strobemeta_cfgs
, &pid
);
510 data
->int_vals_set_mask
= 0;
511 data
->req_meta_valid
= 0;
512 payload
= data
->payload
;
514 * we don't have struct task_struct definition, it should be:
515 * tls_base = (void *)task->thread.fsbase;
517 tls_base
= (void *)task
;
520 struct read_var_ctx ctx
= {
522 .tls_base
= tls_base
,
529 ctx
.type
= READ_INT_VAR
;
530 err
= bpf_loop(STROBE_MAX_INTS
, read_var_callback
, &ctx
, 0);
531 if (err
!= STROBE_MAX_INTS
)
534 ctx
.type
= READ_STR_VAR
;
535 err
= bpf_loop(STROBE_MAX_STRS
, read_var_callback
, &ctx
, 0);
536 if (err
!= STROBE_MAX_STRS
)
539 ctx
.type
= READ_MAP_VAR
;
540 err
= bpf_loop(STROBE_MAX_MAPS
, read_var_callback
, &ctx
, 0);
541 if (err
!= STROBE_MAX_MAPS
)
545 #pragma clang loop unroll(disable)
548 #endif /* NO_UNROLL */
549 for (int i
= 0; i
< STROBE_MAX_INTS
; ++i
) {
550 read_int_var(cfg
, i
, tls_base
, &value
, data
);
553 #pragma clang loop unroll(disable)
556 #endif /* NO_UNROLL */
557 for (int i
= 0; i
< STROBE_MAX_STRS
; ++i
) {
558 payload
+= read_str_var(cfg
, i
, tls_base
, &value
, data
, payload
);
561 #pragma clang loop unroll(disable)
564 #endif /* NO_UNROLL */
565 for (int i
= 0; i
< STROBE_MAX_MAPS
; ++i
) {
566 payload
= read_map_var(cfg
, i
, tls_base
, &value
, data
, payload
);
568 #endif /* USE_BPF_LOOP */
571 * return pointer right after end of payload, so it's possible to
572 * calculate exact amount of useful data that needs to be sent
577 SEC("raw_tracepoint/kfree_skb")
578 int on_event(struct pt_regs
*ctx
) {
579 pid_t pid
= bpf_get_current_pid_tgid() >> 32;
580 struct strobelight_bpf_sample
* sample
;
581 struct task_struct
*task
;
586 sample
= bpf_map_lookup_elem(&sample_heap
, &zero
);
588 return 0; /* this will never happen */
591 bpf_get_current_comm(&sample
->comm
, TASK_COMM_LEN
);
592 ktime_ns
= bpf_ktime_get_ns();
593 sample
->ktime
= ktime_ns
;
595 task
= (struct task_struct
*)bpf_get_current_task();
596 sample_end
= read_strobe_meta(task
, &sample
->metadata
);
597 sample
->has_meta
= sample_end
!= NULL
;
598 sample_end
= sample_end
? : &sample
->metadata
;
600 if ((ktime_ns
>> STACK_TABLE_EPOCH_SHIFT
) & 1) {
601 sample
->kernel_stack_id
= bpf_get_stackid(ctx
, &stacks_1
, 0);
602 sample
->user_stack_id
= bpf_get_stackid(ctx
, &stacks_1
, BPF_F_USER_STACK
);
604 sample
->kernel_stack_id
= bpf_get_stackid(ctx
, &stacks_0
, 0);
605 sample
->user_stack_id
= bpf_get_stackid(ctx
, &stacks_0
, BPF_F_USER_STACK
);
608 uint64_t sample_size
= sample_end
- (void *)sample
;
609 /* should always be true */
610 if (sample_size
< sizeof(struct strobelight_bpf_sample
))
611 bpf_perf_event_output(ctx
, &samples
, 0, sample
, 1 + sample_size
);
615 char _license
[] SEC("license") = "GPL";