diff options
Diffstat (limited to 'tools/lib')
-rw-r--r-- | tools/lib/bpf/bpf_helpers.h | 31 | ||||
-rw-r--r-- | tools/lib/bpf/bpf_tracing.h | 107 | ||||
-rw-r--r-- | tools/lib/bpf/btf.c | 32 | ||||
-rw-r--r-- | tools/lib/bpf/btf.h | 25 | ||||
-rw-r--r-- | tools/lib/bpf/btf_dump.c | 2 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf.c | 106 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf.h | 111 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf.map | 10 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf_probes.c | 1 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf_version.h | 2 | ||||
-rw-r--r-- | tools/lib/bpf/nlattr.c | 2 | ||||
-rw-r--r-- | tools/lib/bpf/ringbuf.c | 271 | ||||
-rw-r--r-- | tools/lib/bpf/usdt.c | 2 |
13 files changed, 596 insertions, 106 deletions
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 867b734839dd..d37c4fe2849d 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -131,7 +131,7 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ -#if (!defined(__clang__) || __clang_major__ >= 8) && defined(__bpf__) +#if __clang_major__ >= 8 && defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -139,8 +139,8 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) __bpf_unreachable(); /* - * Provide a hard guarantee that the compiler won't optimize setting r2 - * (map pointer) and r3 (constant map index) from _different paths_ ending + * Provide a hard guarantee that LLVM won't optimize setting r2 (map + * pointer) and r3 (constant map index) from _different paths_ ending * up at the _same_ call insn as otherwise we won't be able to use the * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key @@ -148,37 +148,18 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) * * Note on clobber list: we need to stay in-line with BPF calling * convention, so even if we don't end up using r0, r4, r5, we need - * to mark them as clobber so that the compiler doesn't end up using - * them before / after the call. + * to mark them as clobber so that LLVM doesn't end up using them + * before / after the call. */ - asm volatile( -#ifdef __clang__ - "r1 = %[ctx]\n\t" + asm volatile("r1 = %[ctx]\n\t" "r2 = %[map]\n\t" "r3 = %[slot]\n\t" -#else - "mov %%r1,%[ctx]\n\t" - "mov %%r2,%[map]\n\t" - "mov %%r3,%[slot]\n\t" -#endif "call 12" :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot) : "r0", "r1", "r2", "r3", "r4", "r5"); } #endif -/* - * Helper structure used by eBPF C program - * to describe BPF map attributes to libbpf loader - */ -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; -} __attribute__((deprecated("use BTF-defined maps in .maps section"))); - enum libbpf_pin_type { LIBBPF_PIN_NONE, /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 5fdb93da423b..2972dc25ff72 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -438,6 +438,113 @@ typeof(name(0)) name(unsigned long long *ctx) \ static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args) +#ifndef ___bpf_nth2 +#define ___bpf_nth2(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ + _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N +#endif +#ifndef ___bpf_narg2 +#define ___bpf_narg2(...) \ + ___bpf_nth2(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, \ + 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0) +#endif + +#define ___bpf_treg_cnt(t) \ + __builtin_choose_expr(sizeof(t) == 1, 1, \ + __builtin_choose_expr(sizeof(t) == 2, 1, \ + __builtin_choose_expr(sizeof(t) == 4, 1, \ + __builtin_choose_expr(sizeof(t) == 8, 1, \ + __builtin_choose_expr(sizeof(t) == 16, 2, \ + (void)0))))) + +#define ___bpf_reg_cnt0() (0) +#define ___bpf_reg_cnt1(t, x) (___bpf_reg_cnt0() + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt2(t, x, args...) (___bpf_reg_cnt1(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt3(t, x, args...) (___bpf_reg_cnt2(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt4(t, x, args...) (___bpf_reg_cnt3(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt5(t, x, args...) (___bpf_reg_cnt4(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt6(t, x, args...) (___bpf_reg_cnt5(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt7(t, x, args...) (___bpf_reg_cnt6(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt8(t, x, args...) (___bpf_reg_cnt7(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt9(t, x, args...) (___bpf_reg_cnt8(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt10(t, x, args...) (___bpf_reg_cnt9(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt11(t, x, args...) (___bpf_reg_cnt10(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt12(t, x, args...) (___bpf_reg_cnt11(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt(args...) ___bpf_apply(___bpf_reg_cnt, ___bpf_narg2(args))(args) + +#define ___bpf_union_arg(t, x, n) \ + __builtin_choose_expr(sizeof(t) == 1, ({ union { __u8 z[1]; t x; } ___t = { .z = {ctx[n]}}; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 2, ({ union { __u16 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 4, ({ union { __u32 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 8, ({ union { __u64 z[1]; t x; } ___t = {.z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 16, ({ union { __u64 z[2]; t x; } ___t = {.z = {ctx[n], ctx[n + 1]} }; ___t.x; }), \ + (void)0))))) + +#define ___bpf_ctx_arg0(n, args...) +#define ___bpf_ctx_arg1(n, t, x) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt1(t, x)) +#define ___bpf_ctx_arg2(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt2(t, x, args)) ___bpf_ctx_arg1(n, args) +#define ___bpf_ctx_arg3(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt3(t, x, args)) ___bpf_ctx_arg2(n, args) +#define ___bpf_ctx_arg4(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt4(t, x, args)) ___bpf_ctx_arg3(n, args) +#define ___bpf_ctx_arg5(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt5(t, x, args)) ___bpf_ctx_arg4(n, args) +#define ___bpf_ctx_arg6(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt6(t, x, args)) ___bpf_ctx_arg5(n, args) +#define ___bpf_ctx_arg7(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt7(t, x, args)) ___bpf_ctx_arg6(n, args) +#define ___bpf_ctx_arg8(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt8(t, x, args)) ___bpf_ctx_arg7(n, args) +#define ___bpf_ctx_arg9(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt9(t, x, args)) ___bpf_ctx_arg8(n, args) +#define ___bpf_ctx_arg10(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt10(t, x, args)) ___bpf_ctx_arg9(n, args) +#define ___bpf_ctx_arg11(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt11(t, x, args)) ___bpf_ctx_arg10(n, args) +#define ___bpf_ctx_arg12(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt12(t, x, args)) ___bpf_ctx_arg11(n, args) +#define ___bpf_ctx_arg(args...) ___bpf_apply(___bpf_ctx_arg, ___bpf_narg2(args))(___bpf_reg_cnt(args), args) + +#define ___bpf_ctx_decl0() +#define ___bpf_ctx_decl1(t, x) , t x +#define ___bpf_ctx_decl2(t, x, args...) , t x ___bpf_ctx_decl1(args) +#define ___bpf_ctx_decl3(t, x, args...) , t x ___bpf_ctx_decl2(args) +#define ___bpf_ctx_decl4(t, x, args...) , t x ___bpf_ctx_decl3(args) +#define ___bpf_ctx_decl5(t, x, args...) , t x ___bpf_ctx_decl4(args) +#define ___bpf_ctx_decl6(t, x, args...) , t x ___bpf_ctx_decl5(args) +#define ___bpf_ctx_decl7(t, x, args...) , t x ___bpf_ctx_decl6(args) +#define ___bpf_ctx_decl8(t, x, args...) , t x ___bpf_ctx_decl7(args) +#define ___bpf_ctx_decl9(t, x, args...) , t x ___bpf_ctx_decl8(args) +#define ___bpf_ctx_decl10(t, x, args...) , t x ___bpf_ctx_decl9(args) +#define ___bpf_ctx_decl11(t, x, args...) , t x ___bpf_ctx_decl10(args) +#define ___bpf_ctx_decl12(t, x, args...) , t x ___bpf_ctx_decl11(args) +#define ___bpf_ctx_decl(args...) ___bpf_apply(___bpf_ctx_decl, ___bpf_narg2(args))(args) + +/* + * BPF_PROG2 is an enhanced version of BPF_PROG in order to handle struct + * arguments. Since each struct argument might take one or two u64 values + * in the trampoline stack, argument type size is needed to place proper number + * of u64 values for each argument. Therefore, BPF_PROG2 has different + * syntax from BPF_PROG. For example, for the following BPF_PROG syntax: + * + * int BPF_PROG(test2, int a, int b) { ... } + * + * the corresponding BPF_PROG2 syntax is: + * + * int BPF_PROG2(test2, int, a, int, b) { ... } + * + * where type and the corresponding argument name are separated by comma. + * + * Use BPF_PROG2 macro if one of the arguments might be a struct/union larger + * than 8 bytes: + * + * int BPF_PROG2(test_struct_arg, struct bpf_testmod_struct_arg_1, a, int, b, + * int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret) + * { + * // access a, b, c, d, e, and ret directly + * ... + * } + */ +#define BPF_PROG2(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + return ____##name(ctx ___bpf_ctx_arg(args)); \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) + struct pt_regs; #define ___bpf_kprobe_args0() ctx diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 361131518d63..d88647da2c7f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4642,20 +4642,17 @@ static int btf_dedup_remap_types(struct btf_dedup *d) */ struct btf *btf__load_vmlinux_btf(void) { - struct { - const char *path_fmt; - bool raw_btf; - } locations[] = { + const char *locations[] = { /* try canonical vmlinux BTF through sysfs first */ - { "/sys/kernel/btf/vmlinux", true /* raw BTF */ }, - /* fall back to trying to find vmlinux ELF on disk otherwise */ - { "/boot/vmlinux-%1$s" }, - { "/lib/modules/%1$s/vmlinux-%1$s" }, - { "/lib/modules/%1$s/build/vmlinux" }, - { "/usr/lib/modules/%1$s/kernel/vmlinux" }, - { "/usr/lib/debug/boot/vmlinux-%1$s" }, - { "/usr/lib/debug/boot/vmlinux-%1$s.debug" }, - { "/usr/lib/debug/lib/modules/%1$s/vmlinux" }, + "/sys/kernel/btf/vmlinux", + /* fall back to trying to find vmlinux on disk otherwise */ + "/boot/vmlinux-%1$s", + "/lib/modules/%1$s/vmlinux-%1$s", + "/lib/modules/%1$s/build/vmlinux", + "/usr/lib/modules/%1$s/kernel/vmlinux", + "/usr/lib/debug/boot/vmlinux-%1$s", + "/usr/lib/debug/boot/vmlinux-%1$s.debug", + "/usr/lib/debug/lib/modules/%1$s/vmlinux", }; char path[PATH_MAX + 1]; struct utsname buf; @@ -4665,15 +4662,12 @@ struct btf *btf__load_vmlinux_btf(void) uname(&buf); for (i = 0; i < ARRAY_SIZE(locations); i++) { - snprintf(path, PATH_MAX, locations[i].path_fmt, buf.release); + snprintf(path, PATH_MAX, locations[i], buf.release); - if (access(path, R_OK)) + if (faccessat(AT_FDCWD, path, R_OK, AT_EACCESS)) continue; - if (locations[i].raw_btf) - btf = btf__parse_raw(path); - else - btf = btf__parse_elf(path, NULL); + btf = btf__parse(path, NULL); err = libbpf_get_error(btf); pr_debug("loading kernel BTF '%s': %d\n", path, err); if (err) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index ae543144ee30..8e6880d91c84 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -486,6 +486,8 @@ static inline struct btf_enum *btf_enum(const struct btf_type *t) return (struct btf_enum *)(t + 1); } +struct btf_enum64; + static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) { return (struct btf_enum64 *)(t + 1); @@ -493,7 +495,28 @@ static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) static inline __u64 btf_enum64_value(const struct btf_enum64 *e) { - return ((__u64)e->val_hi32 << 32) | e->val_lo32; + /* struct btf_enum64 is introduced in Linux 6.0, which is very + * bleeding-edge. Here we are avoiding relying on struct btf_enum64 + * definition coming from kernel UAPI headers to support wider range + * of system-wide kernel headers. + * + * Given this header can be also included from C++ applications, that + * further restricts C tricks we can use (like using compatible + * anonymous struct). So just treat struct btf_enum64 as + * a three-element array of u32 and access second (lo32) and third + * (hi32) elements directly. + * + * For reference, here is a struct btf_enum64 definition: + * + * const struct btf_enum64 { + * __u32 name_off; + * __u32 val_lo32; + * __u32 val_hi32; + * }; + */ + const __u32 *e64 = (const __u32 *)e; + + return ((__u64)e64[2] << 32) | e64[1]; } static inline struct btf_member *btf_members(const struct btf_type *t) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 627edb5bb6de..4221f73a74d0 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -2385,7 +2385,7 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0); /* default indent string is a tab */ - if (!opts->indent_str) + if (!OPTS_GET(opts, indent_str, NULL)) d->typed_dump->indent_str[0] = '\t'; else libbpf_strlcpy(d->typed_dump->indent_str, opts->indent_str, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3ad139285fad..184ce1684dcd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -163,6 +163,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", }; static const char * const prog_type_name[] = { @@ -883,7 +884,7 @@ __u32 get_kernel_version(void) __u32 major, minor, patch; struct utsname info; - if (access(ubuntu_kver_file, R_OK) == 0) { + if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) == 0) { FILE *f; f = fopen(ubuntu_kver_file, "r"); @@ -2096,19 +2097,30 @@ static bool get_map_field_int(const char *map_name, const struct btf *btf, return true; } +static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name) +{ + int len; + + len = snprintf(buf, buf_sz, "%s/%s", path, name); + if (len < 0) + return -EINVAL; + if (len >= buf_sz) + return -ENAMETOOLONG; + + return 0; +} + static int build_map_pin_path(struct bpf_map *map, const char *path) { char buf[PATH_MAX]; - int len; + int err; if (!path) path = "/sys/fs/bpf"; - len = snprintf(buf, PATH_MAX, "%s/%s", path, bpf_map__name(map)); - if (len < 0) - return -EINVAL; - else if (len >= PATH_MAX) - return -ENAMETOOLONG; + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return err; return bpf_map__set_pin_path(map, buf); } @@ -2372,6 +2384,12 @@ static size_t adjust_ringbuf_sz(size_t sz) return sz; } +static bool map_is_ringbuf(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_RINGBUF || + map->def.type == BPF_MAP_TYPE_USER_RINGBUF; +} + static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) { map->def.type = def->map_type; @@ -2386,7 +2404,7 @@ static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def map->btf_value_type_id = def->value_type_id; /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ - if (map->def.type == BPF_MAP_TYPE_RINGBUF) + if (map_is_ringbuf(map)) map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); if (def->parts & MAP_DEF_MAP_TYPE) @@ -4369,7 +4387,7 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) map->def.max_entries = max_entries; /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ - if (map->def.type == BPF_MAP_TYPE_RINGBUF) + if (map_is_ringbuf(map)) map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); return 0; @@ -7961,17 +7979,9 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) continue; if (path) { - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, - bpf_map__name(map)); - if (len < 0) { - err = -EINVAL; - goto err_unpin_maps; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) goto err_unpin_maps; - } sanitize_pin_path(buf); pin_path = buf; } else if (!map->pin_path) { @@ -8009,14 +8019,9 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) char buf[PATH_MAX]; if (path) { - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, - bpf_map__name(map)); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return libbpf_err(err); sanitize_pin_path(buf); pin_path = buf; } else if (!map->pin_path) { @@ -8034,6 +8039,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) int bpf_object__pin_programs(struct bpf_object *obj, const char *path) { struct bpf_program *prog; + char buf[PATH_MAX]; int err; if (!obj) @@ -8045,17 +8051,9 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) } bpf_object__for_each_program(prog, obj) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); - if (len < 0) { - err = -EINVAL; - goto err_unpin_programs; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) goto err_unpin_programs; - } err = bpf_program__pin(prog, buf); if (err) @@ -8066,13 +8064,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) err_unpin_programs: while ((prog = bpf_object__prev_program(obj, prog))) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); - if (len < 0) - continue; - else if (len >= PATH_MAX) + if (pathname_concat(buf, sizeof(buf), path, prog->name)) continue; bpf_program__unpin(prog, buf); @@ -8091,13 +8083,10 @@ int bpf_object__unpin_programs(struct bpf_object *obj, const char *path) bpf_object__for_each_program(prog, obj) { char buf[PATH_MAX]; - int len; - len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + return libbpf_err(err); err = bpf_program__unpin(prog, buf); if (err) @@ -9084,11 +9073,15 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac int err = 0; /* BPF program's BTF ID */ - if (attach_prog_fd) { + if (prog->type == BPF_PROG_TYPE_EXT || attach_prog_fd) { + if (!attach_prog_fd) { + pr_warn("prog '%s': attach program FD is not set\n", prog->name); + return -EINVAL; + } err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); if (err < 0) { - pr_warn("failed to find BPF program (FD %d) BTF ID for '%s': %d\n", - attach_prog_fd, attach_name, err); + pr_warn("prog '%s': failed to find BPF program (FD %d) BTF ID for '%s': %d\n", + prog->name, attach_prog_fd, attach_name, err); return err; } *btf_obj_fd = 0; @@ -9105,7 +9098,8 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); } if (err) { - pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err); + pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %d\n", + prog->name, attach_name, err); return err; } return 0; @@ -9910,7 +9904,7 @@ static bool use_debugfs(void) static int has_debugfs = -1; if (has_debugfs < 0) - has_debugfs = access(DEBUGFS, F_OK) == 0; + has_debugfs = faccessat(AT_FDCWD, DEBUGFS, F_OK, AT_EACCESS) == 0; return has_debugfs == 1; } @@ -10727,7 +10721,7 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz) continue; snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); /* ensure it has required permissions */ - if (access(result, perm) < 0) + if (faccessat(AT_FDCWD, result, perm, AT_EACCESS) < 0) continue; pr_debug("resolved '%s' to '%s'\n", file, result); return 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 88a1ac34b12a..eee883f007f9 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -118,7 +118,9 @@ struct bpf_object_open_opts { * auto-pinned to that path on load; defaults to "/sys/fs/bpf". */ const char *pin_root_path; - long :0; + + __u32 :32; /* stub out now removed attach_prog_fd */ + /* Additional kernel config content that augments and overrides * system Kconfig for CONFIG_xxx externs. */ @@ -1011,6 +1013,7 @@ LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook, /* Ring buffer APIs */ struct ring_buffer; +struct user_ring_buffer; typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); @@ -1030,6 +1033,112 @@ LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); +struct user_ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define user_ring_buffer_opts__last_field sz + +/* @brief **user_ring_buffer__new()** creates a new instance of a user ring + * buffer. + * + * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map. + * @param opts Options for how the ring buffer should be created. + * @return A user ring buffer on success; NULL and errno being set on a + * failure. + */ +LIBBPF_API struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); + +/* @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the + * user ring buffer. + * @param rb A pointer to a user ring buffer. + * @param size The size of the sample, in bytes. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize accessing + * this function if there are multiple producers. If a size is requested that + * is larger than the size of the entire ring buffer, errno will be set to + * E2BIG and NULL is returned. If the ring buffer could accommodate the size, + * but currently does not have enough space, errno is set to ENOSPC and NULL is + * returned. + * + * After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the kernel. Otherwise, + * the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size); + +/* @brief **user_ring_buffer__reserve_blocking()** reserves a record in the + * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes + * available. + * @param rb The user ring buffer. + * @param size The size of the sample, in bytes. + * @param timeout_ms The amount of time, in milliseconds, for which the caller + * should block when waiting for a sample. -1 causes the caller to block + * indefinitely. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize + * accessing this function if there are multiple producers + * + * If **timeout_ms** is -1, the function will block indefinitely until a sample + * becomes available. Otherwise, **timeout_ms** must be non-negative, or errno + * is set to EINVAL, and NULL is returned. If **timeout_ms** is 0, no blocking + * will occur and the function will return immediately after attempting to + * reserve a sample. + * + * If **size** is larger than the size of the entire ring buffer, errno is set + * to E2BIG and NULL is returned. If the ring buffer could accommodate + * **size**, but currently does not have enough space, the caller will block + * until at most **timeout_ms** has elapsed. If insufficient space is available + * at that time, errno is set to ENOSPC, and NULL is returned. + * + * The kernel guarantees that it will wake up this thread to check if + * sufficient space is available in the ring buffer at least once per + * invocation of the **bpf_ringbuf_drain()** helper function, provided that at + * least one sample is consumed, and the BPF program did not invoke the + * function with BPF_RB_NO_WAKEUP. A wakeup may occur sooner than that, but the + * kernel does not guarantee this. If the helper function is invoked with + * BPF_RB_FORCE_WAKEUP, a wakeup event will be sent even if no sample is + * consumed. + * + * When a sample of size **size** is found within **timeout_ms**, a pointer to + * the sample is returned. After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the ring buffer. + * Otherwise, the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, + __u32 size, + int timeout_ms); + +/* @brief **user_ring_buffer__submit()** submits a previously reserved sample + * into the ring buffer. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample); + +/* @brief **user_ring_buffer__discard()** discards a previously reserved sample. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample); + +/* @brief **user_ring_buffer__free()** frees a ring buffer that was previously + * created with **user_ring_buffer__new()**. + * @param rb The user ring buffer being freed. + */ +LIBBPF_API void user_ring_buffer__free(struct user_ring_buffer *rb); + /* Perf buffer APIs */ struct perf_buffer; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 2b928dc21af0..c1d6aa7c82b6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -368,3 +368,13 @@ LIBBPF_1.0.0 { libbpf_bpf_prog_type_str; perf_buffer__buffer; }; + +LIBBPF_1.1.0 { + global: + user_ring_buffer__discard; + user_ring_buffer__free; + user_ring_buffer__new; + user_ring_buffer__reserve; + user_ring_buffer__reserve_blocking; + user_ring_buffer__submit; +} LIBBPF_1.0.0; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 6d495656f554..f3a8e8e74eb8 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -231,6 +231,7 @@ static int probe_map_create(enum bpf_map_type map_type) return btf_fd; break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: key_size = 0; value_size = 0; max_entries = 4096; diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index 2fb2f4290080..e944f5bce728 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 0 +#define LIBBPF_MINOR_VERSION 1 #endif /* __LIBBPF_VERSION_H */ diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c index f57e77a6e40f..3900d052ed19 100644 --- a/tools/lib/bpf/nlattr.c +++ b/tools/lib/bpf/nlattr.c @@ -32,7 +32,7 @@ static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) static int nla_ok(const struct nlattr *nla, int remaining) { - return remaining >= sizeof(*nla) && + return remaining >= (int)sizeof(*nla) && nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining; } diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 8bc117bcc7bc..d285171d4b69 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -16,6 +16,7 @@ #include <asm/barrier.h> #include <sys/mman.h> #include <sys/epoll.h> +#include <time.h> #include "libbpf.h" #include "libbpf_internal.h" @@ -39,6 +40,23 @@ struct ring_buffer { int ring_cnt; }; +struct user_ring_buffer { + struct epoll_event event; + unsigned long *consumer_pos; + unsigned long *producer_pos; + void *data; + unsigned long mask; + size_t page_size; + int map_fd; + int epoll_fd; +}; + +/* 8-byte ring buffer header structure */ +struct ringbuf_hdr { + __u32 len; + __u32 pad; +}; + static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) { if (r->consumer_pos) { @@ -300,3 +318,256 @@ int ring_buffer__epoll_fd(const struct ring_buffer *rb) { return rb->epoll_fd; } + +static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) +{ + if (rb->consumer_pos) { + munmap(rb->consumer_pos, rb->page_size); + rb->consumer_pos = NULL; + } + if (rb->producer_pos) { + munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1)); + rb->producer_pos = NULL; + } +} + +void user_ring_buffer__free(struct user_ring_buffer *rb) +{ + if (!rb) + return; + + user_ringbuf_unmap_ring(rb); + + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb); +} + +static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + void *tmp; + struct epoll_event *rb_epoll; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_USER_RINGBUF) { + pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd); + return -EINVAL; + } + + rb->map_fd = map_fd; + rb->mask = info.max_entries - 1; + + /* Map read-only consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + rb->consumer_pos = tmp; + + /* Map read-write the producer page and data pages. We map the data + * region as twice the total size of the ring buffer to allow the + * simple reading and writing of samples that wrap around the end of + * the buffer. See the kernel implementation for details. + */ + tmp = mmap(NULL, rb->page_size + 2 * info.max_entries, + PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->producer_pos = tmp; + rb->data = tmp + rb->page_size; + + rb_epoll = &rb->event; + rb_epoll->events = EPOLLOUT; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) { + err = -errno; + pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); + return err; + } + + return 0; +} + +struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts) +{ + struct user_ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, user_ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("user ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = user_ringbuf_map(rb, map_fd); + if (err) + goto err_out; + + return rb; + +err_out: + user_ring_buffer__free(rb); + return errno = -err, NULL; +} + +static void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard) +{ + __u32 new_len; + struct ringbuf_hdr *hdr; + uintptr_t hdr_offset; + + hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ; + hdr = rb->data + (hdr_offset & rb->mask); + + new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + __atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL); +} + +void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, true); +} + +void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, false); +} + +void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size) +{ + __u32 avail_size, total_size, max_size; + /* 64-bit to avoid overflow in case of extreme application behavior */ + __u64 cons_pos, prod_pos; + struct ringbuf_hdr *hdr; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + cons_pos = smp_load_acquire(rb->consumer_pos); + /* Synchronizes with smp_store_release() in user_ringbuf_commit() */ + prod_pos = smp_load_acquire(rb->producer_pos); + + max_size = rb->mask + 1; + avail_size = max_size - (prod_pos - cons_pos); + /* Round up total size to a multiple of 8. */ + total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8; + + if (total_size > max_size) + return errno = E2BIG, NULL; + + if (avail_size < total_size) + return errno = ENOSPC, NULL; + + hdr = rb->data + (prod_pos & rb->mask); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pad = 0; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + smp_store_release(rb->producer_pos, prod_pos + total_size); + + return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask); +} + +static __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end) +{ + __u64 start_ns, end_ns, ns_per_s = 1000000000; + + start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec; + end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec; + + return end_ns - start_ns; +} + +void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms) +{ + void *sample; + int err, ms_remaining = timeout_ms; + struct timespec start; + + if (timeout_ms < 0 && timeout_ms != -1) + return errno = EINVAL, NULL; + + if (timeout_ms != -1) { + err = clock_gettime(CLOCK_MONOTONIC, &start); + if (err) + return NULL; + } + + do { + int cnt, ms_elapsed; + struct timespec curr; + __u64 ns_per_ms = 1000000; + + sample = user_ring_buffer__reserve(rb, size); + if (sample) + return sample; + else if (errno != ENOSPC) + return NULL; + + /* The kernel guarantees at least one event notification + * delivery whenever at least one sample is drained from the + * ring buffer in an invocation to bpf_ringbuf_drain(). Other + * additional events may be delivered at any time, but only one + * event is guaranteed per bpf_ringbuf_drain() invocation, + * provided that a sample is drained, and the BPF program did + * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If + * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a + * wakeup event will be delivered even if no samples are + * drained. + */ + cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining); + if (cnt < 0) + return NULL; + + if (timeout_ms == -1) + continue; + + err = clock_gettime(CLOCK_MONOTONIC, &curr); + if (err) + return NULL; + + ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms; + ms_remaining = timeout_ms - ms_elapsed; + } while (ms_remaining > 0); + + /* Try one more time to reserve a sample after the specified timeout has elapsed. */ + return user_ring_buffer__reserve(rb, size); +} diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index d18e37982344..e83b497c2245 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -282,7 +282,7 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj) * If this is not supported, USDTs with semaphores will not be supported. * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") */ - man->has_sema_refcnt = access(ref_ctr_sysfs_path, F_OK) == 0; + man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == 0; return man; } |