158 files changed, 9400 insertions, 2213 deletions
diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index a5ab00ac0b14..997560abadab 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -5,7 +5,11 @@ BPF Instruction Set Architecture (ISA) ====================================== -This document specifies the BPF instruction set architecture (ISA). +eBPF (which is no longer an acronym for anything), also commonly +referred to as BPF, is a technology with origins in the Linux kernel +that can run untrusted programs in a privileged context such as an +operating system kernel. This document specifies the BPF instruction +set architecture (ISA). Documentation conventions ========================= @@ -43,7 +47,7 @@ a type's signedness (`S`) and bit width (`N`), respectively. ===== ========= For example, `u32` is a type whose valid values are all the 32-bit unsigned -numbers and `s16` is a types whose valid values are all the 16-bit signed +numbers and `s16` is a type whose valid values are all the 16-bit signed numbers. Functions @@ -108,7 +112,7 @@ conformance group means it must support all instructions in that conformance group. The use of named conformance groups enables interoperability between a runtime -that executes instructions, and tools as such compilers that generate +that executes instructions, and tools such as compilers that generate instructions for the runtime. Thus, capability discovery in terms of conformance groups might be done manually by users or automatically by tools. @@ -181,10 +185,13 @@ A basic instruction is encoded as follows:: (`64-bit immediate instructions`_ reuse this field for other purposes) **dst_reg** - destination register number (0-10) + destination register number (0-10), unless otherwise specified + (future instructions might reuse this field for other purposes) **offset** - signed integer offset used with pointer arithmetic + signed integer offset used with pointer arithmetic, except where + otherwise specified (some arithmetic instructions reuse this field + for other purposes) **imm** signed integer immediate value @@ -228,10 +235,12 @@ This is depicted in the following figure:: operation to perform, encoded as explained above **regs** - The source and destination register numbers, encoded as explained above + The source and destination register numbers (unless otherwise + specified), encoded as explained above **offset** - signed integer offset used with pointer arithmetic + signed integer offset used with pointer arithmetic, unless + otherwise specified **imm** signed integer immediate value @@ -342,8 +351,8 @@ where '(u32)' indicates that the upper 32 bits are zeroed. dst = dst ^ imm -Note that most instructions have instruction offset of 0. Only three instructions -(``SDIV``, ``SMOD``, ``MOVSX``) have a non-zero offset. +Note that most arithmetic instructions have 'offset' set to 0. Only three instructions +(``SDIV``, ``SMOD``, ``MOVSX``) have a non-zero 'offset'. Division, multiplication, and modulo operations for ``ALU`` are part of the "divmul32" conformance group, and division, multiplication, and @@ -365,15 +374,15 @@ Note that there are varying definitions of the signed modulo operation when the dividend or divisor are negative, where implementations often vary by language such that Python, Ruby, etc. differ from C, Go, Java, etc. 
This specification requires that signed modulo use truncated division -(where -13 % 3 == -1) as implemented in C, Go, etc.: +(where -13 % 3 == -1) as implemented in C, Go, etc.:: a % n = a - n * trunc(a / n) The ``MOVSX`` instruction does a move operation with sign extension. -``{MOVSX, X, ALU}`` :term:`sign extends<Sign Extend>` 8-bit and 16-bit operands into 32 -bit operands, and zeroes the remaining upper 32 bits. +``{MOVSX, X, ALU}`` :term:`sign extends<Sign Extend>` 8-bit and 16-bit operands into +32-bit operands, and zeroes the remaining upper 32 bits. ``{MOVSX, X, ALU64}`` :term:`sign extends<Sign Extend>` 8-bit, 16-bit, and 32-bit -operands into 64 bit operands. Unlike other arithmetic instructions, +operands into 64-bit operands. Unlike other arithmetic instructions, ``MOVSX`` is only defined for register source operands (``X``). The ``NEG`` instruction is only defined when the source bit is clear @@ -411,19 +420,19 @@ conformance group. Examples: -``{END, TO_LE, ALU}`` with imm = 16/32/64 means:: +``{END, TO_LE, ALU}`` with 'imm' = 16/32/64 means:: dst = htole16(dst) dst = htole32(dst) dst = htole64(dst) -``{END, TO_BE, ALU}`` with imm = 16/32/64 means:: +``{END, TO_BE, ALU}`` with 'imm' = 16/32/64 means:: dst = htobe16(dst) dst = htobe32(dst) dst = htobe64(dst) -``{END, TO_LE, ALU64}`` with imm = 16/32/64 means:: +``{END, TO_LE, ALU64}`` with 'imm' = 16/32/64 means:: dst = bswap16(dst) dst = bswap32(dst) @@ -438,27 +447,33 @@ otherwise identical operations, and indicates the base64 conformance group unless otherwise specified. The 'code' field encodes the operation as below: -======== ===== ======= =============================== =================================================== -code value src_reg description notes -======== ===== ======= =============================== =================================================== -JA 0x0 0x0 PC += offset {JA, K, JMP} only -JA 0x0 0x0 PC += imm {JA, K, JMP32} only +======== ===== ======= ================================= =================================================== +code value src_reg description notes +======== ===== ======= ================================= =================================================== +JA 0x0 0x0 PC += offset {JA, K, JMP} only +JA 0x0 0x0 PC += imm {JA, K, JMP32} only JEQ 0x1 any PC += offset if dst == src -JGT 0x2 any PC += offset if dst > src unsigned -JGE 0x3 any PC += offset if dst >= src unsigned +JGT 0x2 any PC += offset if dst > src unsigned +JGE 0x3 any PC += offset if dst >= src unsigned JSET 0x4 any PC += offset if dst & src JNE 0x5 any PC += offset if dst != src -JSGT 0x6 any PC += offset if dst > src signed -JSGE 0x7 any PC += offset if dst >= src signed -CALL 0x8 0x0 call helper function by address {CALL, K, JMP} only, see `Helper functions`_ -CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ -CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ -EXIT 0x9 0x0 return {CALL, K, JMP} only -JLT 0xa any PC += offset if dst < src unsigned -JLE 0xb any PC += offset if dst <= src unsigned -JSLT 0xc any PC += offset if dst < src signed -JSLE 0xd any PC += offset if dst <= src signed -======== ===== ======= =============================== =================================================== +JSGT 0x6 any PC += offset if dst > src signed +JSGE 0x7 any PC += offset if dst >= src signed +CALL 0x8 0x0 call helper function by static ID {CALL, K, JMP} only, see `Helper functions`_ +CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see 
`Program-local functions`_ +CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ +EXIT 0x9 0x0 return {CALL, K, JMP} only +JLT 0xa any PC += offset if dst < src unsigned +JLE 0xb any PC += offset if dst <= src unsigned +JSLT 0xc any PC += offset if dst < src signed +JSLE 0xd any PC += offset if dst <= src signed +======== ===== ======= ================================= =================================================== + +where 'PC' denotes the program counter, and the offset to increment by +is in units of 64-bit instructions relative to the instruction following +the jump instruction. Thus 'PC += 1' skips execution of the next +instruction if it's a basic instruction or results in undefined behavior +if the next instruction is a 128-bit wide instruction. The BPF program needs to store the return value into register R0 before doing an ``EXIT``. @@ -475,7 +490,7 @@ where 's>=' indicates a signed '>=' comparison. gotol +imm -where 'imm' means the branch offset comes from insn 'imm' field. +where 'imm' means the branch offset comes from the 'imm' field. Note that there are two flavors of ``JA`` instructions. The ``JMP`` class permits a 16-bit jump offset specified by the 'offset' @@ -493,26 +508,26 @@ Helper functions Helper functions are a concept whereby BPF programs can call into a set of function calls exposed by the underlying platform. -Historically, each helper function was identified by an address -encoded in the imm field. The available helper functions may differ -for each program type, but address values are unique across all program types. +Historically, each helper function was identified by a static ID +encoded in the 'imm' field. The available helper functions may differ +for each program type, but static IDs are unique across all program types. Platforms that support the BPF Type Format (BTF) support identifying -a helper function by a BTF ID encoded in the imm field, where the BTF ID +a helper function by a BTF ID encoded in the 'imm' field, where the BTF ID identifies the helper name and type. Program-local functions ~~~~~~~~~~~~~~~~~~~~~~~ Program-local functions are functions exposed by the same BPF program as the caller, and are referenced by offset from the call instruction, similar to -``JA``. The offset is encoded in the imm field of the call instruction. -A ``EXIT`` within the program-local function will return to the caller. +``JA``. The offset is encoded in the 'imm' field of the call instruction. +An ``EXIT`` within the program-local function will return to the caller. Load and store instructions =========================== For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the -8-bit 'opcode' field is divided as:: +8-bit 'opcode' field is divided as follows:: +-+-+-+-+-+-+-+-+ |mode |sz |class| @@ -580,7 +595,7 @@ instructions that transfer data between a register and memory. dst = *(signed size *) (src + offset) -Where size is one of: ``B``, ``H``, or ``W``, and +Where '<size>' is one of: ``B``, ``H``, or ``W``, and 'signed size' is one of: s8, s16, or s32. 
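The truncated-division modulo rule and the sign-extending load semantics described in the instruction-set documentation hunks above can be sanity-checked with a short, self-contained C snippet; it is illustrative only and not part of this patch, using nothing beyond standard C behavior::

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* Signed modulo uses truncated division, as the spec requires:
             * a % n == a - n * trunc(a / n), so -13 % 3 evaluates to -1.
             * C integer division already truncates toward zero. */
            int64_t a = -13, n = 3;
            assert(a % n == a - n * (a / n));
            assert(a % n == -1);

            /* Sign-extending load: "dst = *(signed size *) (src + offset)"
             * for an s8 operand widens the byte to 64 bits with its sign. */
            int8_t mem[1] = { -5 };
            int64_t dst = *(int8_t *)&mem[0];
            assert(dst == -5);

            return 0;
    }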
Atomic operations @@ -662,11 +677,11 @@ src_reg pseudocode imm type dst type ======= ========================================= =========== ============== 0x0 dst = (next_imm << 32) | imm integer integer 0x1 dst = map_by_fd(imm) map fd map -0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data pointer -0x3 dst = var_addr(imm) variable id data pointer -0x4 dst = code_addr(imm) integer code pointer +0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data address +0x3 dst = var_addr(imm) variable id data address +0x4 dst = code_addr(imm) integer code address 0x5 dst = map_by_idx(imm) map index map -0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data pointer +0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data address ======= ========================================= =========== ============== where diff --git a/MAINTAINERS b/MAINTAINERS index ab89edc6974d..943921d642ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3822,6 +3822,14 @@ F: kernel/bpf/tnum.c F: kernel/bpf/trampoline.c F: kernel/bpf/verifier.c +BPF [CRYPTO] +M: Vadim Fedorenko <vadim.fedorenko@linux.dev> +L: bpf@vger.kernel.org +S: Maintained +F: crypto/bpf_crypto_skcipher.c +F: include/linux/bpf_crypto.h +F: kernel/bpf/crypto.c + BPF [DOCUMENTATION] (Related to Standardization) R: David Vernet <void@manifault.com> L: bpf@vger.kernel.org diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 82d6c562e8c7..76b91f36c729 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -29,6 +29,7 @@ #define TCALL_CNT (MAX_BPF_JIT_REG + 2) #define TMP_REG_3 (MAX_BPF_JIT_REG + 3) #define FP_BOTTOM (MAX_BPF_JIT_REG + 4) +#define ARENA_VM_START (MAX_BPF_JIT_REG + 5) #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -67,6 +68,8 @@ static const int bpf2a64[] = { /* temporary register for blinding constants */ [BPF_REG_AX] = A64_R(9), [FP_BOTTOM] = A64_R(27), + /* callee saved register for kern_vm_start address */ + [ARENA_VM_START] = A64_R(28), }; struct jit_ctx { @@ -79,6 +82,7 @@ struct jit_ctx { __le32 *ro_image; u32 stack_size; int fpb_offset; + u64 user_vm_start; }; struct bpf_plt { @@ -295,7 +299,7 @@ static bool is_lsi_offset(int offset, int scale) #define PROLOGUE_OFFSET (BTI_INSNS + 2 + PAC_INSNS + 8) static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, - bool is_exception_cb) + bool is_exception_cb, u64 arena_vm_start) { const struct bpf_prog *prog = ctx->prog; const bool is_main_prog = !bpf_is_subprog(prog); @@ -306,6 +310,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, const u8 fp = bpf2a64[BPF_REG_FP]; const u8 tcc = bpf2a64[TCALL_CNT]; const u8 fpb = bpf2a64[FP_BOTTOM]; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const int idx0 = ctx->idx; int cur_offset; @@ -411,6 +416,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, /* Set up function call stack */ emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + + if (arena_vm_start) + emit_a64_mov_i64(arena_vm_base, arena_vm_start, ctx); + return 0; } @@ -738,6 +747,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) +#define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -745,7 +755,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int 
dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); - regs->regs[dst_reg] = 0; + if (dst_reg != DONT_CLEAR) + regs->regs[dst_reg] = 0; regs->pc = (unsigned long)&ex->fixup - offset; return true; } @@ -765,7 +776,8 @@ static int add_exception_handler(const struct bpf_insn *insn, return 0; if (BPF_MODE(insn->code) != BPF_PROBE_MEM && - BPF_MODE(insn->code) != BPF_PROBE_MEMSX) + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32) return 0; if (!ctx->prog->aux->extable || @@ -810,6 +822,9 @@ static int add_exception_handler(const struct bpf_insn *insn, ex->insn = ins_offset; + if (BPF_CLASS(insn->code) != BPF_LDX) + dst_reg = DONT_CLEAR; + ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); @@ -829,12 +844,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool extra_pass) { const u8 code = insn->code; - const u8 dst = bpf2a64[insn->dst_reg]; - const u8 src = bpf2a64[insn->src_reg]; + u8 dst = bpf2a64[insn->dst_reg]; + u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; const u8 fp = bpf2a64[BPF_REG_FP]; const u8 fpb = bpf2a64[FP_BOTTOM]; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const s16 off = insn->off; const s32 imm = insn->imm; const int i = insn - ctx->prog->insnsi; @@ -853,6 +869,15 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn_is_cast_user(insn)) { + emit(A64_MOV(0, tmp, src), ctx); // 32-bit mov clears the upper 32 bits + emit_a64_mov_i(0, dst, ctx->user_vm_start >> 32, ctx); + emit(A64_LSL(1, dst, dst, 32), ctx); + emit(A64_CBZ(1, tmp, 2), ctx); + emit(A64_ORR(1, tmp, dst, tmp), ctx); + emit(A64_MOV(1, dst, tmp), ctx); + break; + } switch (insn->off) { case 0: emit(A64_MOV(is64, dst, src), ctx); @@ -1237,7 +1262,15 @@ emit_cond_jmp: case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: - if (ctx->fpb_offset > 0 && src == fp) { + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, src, arena_vm_base), ctx); + src = tmp2; + } + if (ctx->fpb_offset > 0 && src == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { src_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1322,7 +1355,15 @@ emit_cond_jmp: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_DW: - if (ctx->fpb_offset > 0 && dst == fp) { + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, dst, arena_vm_base), ctx); + dst = tmp2; + } + if (ctx->fpb_offset > 0 && dst == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { dst_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1365,6 +1406,10 @@ emit_cond_jmp: } break; } + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; /* STX: *(size *)(dst + off) = src */ @@ -1372,7 +1417,15 @@ emit_cond_jmp: case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_B: case BPF_STX | BPF_MEM | BPF_DW: - if (ctx->fpb_offset > 0 && dst == fp) { + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX 
| BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, dst, arena_vm_base), ctx); + dst = tmp2; + } + if (ctx->fpb_offset > 0 && dst == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { dst_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1413,6 +1466,10 @@ emit_cond_jmp: } break; } + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; case BPF_STX | BPF_ATOMIC | BPF_W: @@ -1594,6 +1651,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; + u64 arena_vm_start; u8 *image_ptr; u8 *ro_image_ptr; @@ -1611,6 +1669,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = tmp; } + arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); @@ -1641,6 +1700,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.fpb_offset = find_fpb_offset(prog); + ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); /* * 1. Initial fake pass to compute ctx->idx and ctx->offset. @@ -1648,7 +1708,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * BPF line info needs ctx->offset[i] to be the offset of * instruction[i] in jited image, so build prologue first. */ - if (build_prologue(&ctx, was_classic, prog->aux->exception_cb)) { + if (build_prologue(&ctx, was_classic, prog->aux->exception_cb, + arena_vm_start)) { prog = orig_prog; goto out_off; } @@ -1696,7 +1757,7 @@ skip_init_ctx: ctx.idx = 0; ctx.exentry_idx = 0; - build_prologue(&ctx, was_classic, prog->aux->exception_cb); + build_prologue(&ctx, was_classic, prog->aux->exception_cb, arena_vm_start); if (build_body(&ctx, extra_pass)) { prog = orig_prog; @@ -2461,6 +2522,11 @@ bool bpf_jit_supports_exceptions(void) return true; } +bool bpf_jit_supports_arena(void) +{ + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index f4b6b3b9edda..5fc374ed98ea 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -81,6 +81,8 @@ struct rv_jit_context { int nexentries; unsigned long flags; int stack_size; + u64 arena_vm_start; + u64 user_vm_start; }; /* Convert from ninsns to bytes. */ diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 1adf2f39ce59..15e482f2c657 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -18,6 +18,7 @@ #define RV_REG_TCC RV_REG_A6 #define RV_REG_TCC_SAVED RV_REG_S6 /* Store A6 in S6 if program do calls */ +#define RV_REG_ARENA RV_REG_S7 /* For storing arena_vm_start */ static const int regmap[] = { [BPF_REG_0] = RV_REG_A5, @@ -255,6 +256,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx); store_offset -= 8; } + if (ctx->arena_vm_start) { + emit_ld(RV_REG_ARENA, store_offset, RV_REG_SP, ctx); + store_offset -= 8; + } emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx); /* Set return value. 
*/ @@ -548,6 +553,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) +#define REG_DONT_CLEAR_MARKER 0 /* RV_REG_ZERO unused in pt_regmap */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -555,7 +561,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); - *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0; + if (regs_offset != REG_DONT_CLEAR_MARKER) + *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0; regs->epc = (unsigned long)&ex->fixup - offset; return true; @@ -572,7 +579,8 @@ static int add_exception_handler(const struct bpf_insn *insn, off_t fixup_offset; if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable || - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX)) + (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32)) return 0; if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) @@ -1073,6 +1081,15 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn_is_cast_user(insn)) { + emit_mv(RV_REG_T1, rs, ctx); + emit_zextw(RV_REG_T1, RV_REG_T1, ctx); + emit_imm(rd, (ctx->user_vm_start >> 32) << 32, ctx); + emit(rv_beq(RV_REG_T1, RV_REG_ZERO, 4), ctx); + emit_or(RV_REG_T1, rd, RV_REG_T1, ctx); + emit_mv(rd, RV_REG_T1, ctx); + break; + } if (imm == 1) { /* Special mov32 for zext */ emit_zextw(rd, rd, ctx); @@ -1539,6 +1556,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: + /* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + RV_REG_ARENA + off) */ + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: { int insn_len, insns_start; bool sign_ext; @@ -1546,6 +1568,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, sign_ext = BPF_MODE(insn->code) == BPF_MEMSX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX; + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T2, rs, RV_REG_ARENA, ctx); + rs = RV_REG_T2; + } + switch (BPF_SIZE(code)) { case BPF_B: if (is_12b_int(off)) { @@ -1682,6 +1709,86 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); break; + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + { + int insn_len, insns_start; + + emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T3; + + /* Load imm to a register then store it */ + emit_imm(RV_REG_T1, imm, ctx); + + switch (BPF_SIZE(code)) { + case BPF_B: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sb(rd, off, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case 
BPF_H: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sh(rd, off, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_W: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sw(rd, off, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_DW: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sd(rd, off, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, + insn_len); + if (ret) + return ret; + + break; + } + /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: if (is_12b_int(off)) { @@ -1728,6 +1835,84 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_atomic(rd, rs, off, imm, BPF_SIZE(code) == BPF_DW, ctx); break; + + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + { + int insn_len, insns_start; + + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; + + switch (BPF_SIZE(code)) { + case BPF_B: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sb(rd, off, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sb(RV_REG_T1, 0, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_H: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sh(rd, off, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sh(RV_REG_T1, 0, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_W: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sw(rd, off, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit_sw(RV_REG_T1, 0, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_DW: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sd(rd, off, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit_sd(RV_REG_T1, 0, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, + insn_len); + if (ret) + return ret; + + break; + } + default: pr_err("bpf-jit: unknown opcode %02x\n", code); return -EINVAL; @@ -1759,6 +1944,8 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) stack_adjust += 8; if (seen_reg(RV_REG_S6, ctx)) stack_adjust += 8; + if (ctx->arena_vm_start) + stack_adjust += 8; stack_adjust = 
round_up(stack_adjust, 16); stack_adjust += bpf_stack_adjust; @@ -1810,6 +1997,10 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) emit_sd(RV_REG_SP, store_offset, RV_REG_S6, ctx); store_offset -= 8; } + if (ctx->arena_vm_start) { + emit_sd(RV_REG_SP, store_offset, RV_REG_ARENA, ctx); + store_offset -= 8; + } emit_addi(RV_REG_FP, RV_REG_SP, stack_adjust, ctx); @@ -1823,6 +2014,9 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) emit_mv(RV_REG_TCC_SAVED, RV_REG_TCC, ctx); ctx->stack_size = stack_adjust; + + if (ctx->arena_vm_start) + emit_imm(RV_REG_ARENA, ctx->arena_vm_start, ctx); } void bpf_jit_build_epilogue(struct rv_jit_context *ctx) @@ -1839,3 +2033,8 @@ bool bpf_jit_supports_ptr_xchg(void) { return true; } + +bool bpf_jit_supports_arena(void) +{ + return true; +} diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 6b3acac30c06..8a69d6d81e32 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -80,6 +80,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto skip_init_ctx; } + ctx->arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); + ctx->user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); ctx->prog = prog; ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL); if (!ctx->offset) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 788a3d6f6276..ff217cc35ce9 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -816,9 +816,10 @@ done: static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo) { + u64 imm64 = ((u64)imm32_hi << 32) | (u32)imm32_lo; u8 *prog = *pprog; - if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + if (is_uimm32(imm64)) { /* * For emitting plain u32, where sign bit must not be * propagated LLVM tends to load imm64 over mov32 @@ -826,6 +827,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, * 'mov %eax, imm32' instead. 
*/ emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else if (is_simm32(imm64)) { + emit_mov_imm32(&prog, true, dst_reg, imm32_lo); } else { /* movabsq rax, imm64 */ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); @@ -1169,6 +1172,54 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } +static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + EMIT1(0xF0); /* lock prefix */ + switch (size) { + case BPF_W: + EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg)); + break; + case BPF_DW: + EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg)); + break; + default: + pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n"); + return -EFAULT; + } + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* lock *(u32/u64*)(dst_reg + idx_reg + off) <op>= src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + idx_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + idx_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + idx_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off); + *pprog = prog; + return 0; +} + #define DONT_CLEAR 1 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) @@ -1382,6 +1433,16 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image maybe_emit_mod(&prog, AUX_REG, dst_reg, true); EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg)); break; + } else if (insn_is_mov_percpu_addr(insn)) { + /* mov <dst>, <src> (if necessary) */ + EMIT_mov(dst_reg, src_reg); +#ifdef CONFIG_SMP + /* add <dst>, gs:[<off>] */ + EMIT2(0x65, add_1mod(0x48, dst_reg)); + EMIT3(0x03, add_2reg(0x04, 0, dst_reg), 0x25); + EMIT((u32)(unsigned long)&this_cpu_off, 4); +#endif + break; } fallthrough; case BPF_ALU | BPF_MOV | BPF_X: @@ -1969,6 +2030,15 @@ populate_extable: return err; break; + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: + start_of_ldx = prog; + err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code), + dst_reg, src_reg, X86_REG_R12, insn->off); + if (err) + return err; + goto populate_extable; + /* call */ case BPF_JMP | BPF_CALL: { u8 *ip = image + addrs[i - 1]; @@ -3362,6 +3432,11 @@ bool bpf_jit_supports_subprog_tailcalls(void) return true; } +bool bpf_jit_supports_percpu_insn(void) +{ + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { @@ -3465,6 +3540,21 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) + return false; + } + return true; +} + bool bpf_jit_supports_ptr_xchg(void) { return true; diff --git a/crypto/Makefile b/crypto/Makefile index 408f0a1f9ab9..538124f8bf8a 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -20,6 +20,9 @@ crypto_skcipher-y += lskcipher.o crypto_skcipher-y += skcipher.o obj-$(CONFIG_CRYPTO_SKCIPHER2) += 
crypto_skcipher.o +ifeq ($(CONFIG_BPF_SYSCALL),y) +obj-$(CONFIG_CRYPTO_SKCIPHER2) += bpf_crypto_skcipher.o +endif obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o obj-$(CONFIG_CRYPTO_ECHAINIV) += echainiv.o diff --git a/crypto/bpf_crypto_skcipher.c b/crypto/bpf_crypto_skcipher.c new file mode 100644 index 000000000000..b5e657415770 --- /dev/null +++ b/crypto/bpf_crypto_skcipher.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta, Inc */ +#include <linux/types.h> +#include <linux/module.h> +#include <linux/bpf_crypto.h> +#include <crypto/skcipher.h> + +static void *bpf_crypto_lskcipher_alloc_tfm(const char *algo) +{ + return crypto_alloc_lskcipher(algo, 0, 0); +} + +static void bpf_crypto_lskcipher_free_tfm(void *tfm) +{ + crypto_free_lskcipher(tfm); +} + +static int bpf_crypto_lskcipher_has_algo(const char *algo) +{ + return crypto_has_skcipher(algo, CRYPTO_ALG_TYPE_LSKCIPHER, CRYPTO_ALG_TYPE_MASK); +} + +static int bpf_crypto_lskcipher_setkey(void *tfm, const u8 *key, unsigned int keylen) +{ + return crypto_lskcipher_setkey(tfm, key, keylen); +} + +static u32 bpf_crypto_lskcipher_get_flags(void *tfm) +{ + return crypto_lskcipher_get_flags(tfm); +} + +static unsigned int bpf_crypto_lskcipher_ivsize(void *tfm) +{ + return crypto_lskcipher_ivsize(tfm); +} + +static unsigned int bpf_crypto_lskcipher_statesize(void *tfm) +{ + return crypto_lskcipher_statesize(tfm); +} + +static int bpf_crypto_lskcipher_encrypt(void *tfm, const u8 *src, u8 *dst, + unsigned int len, u8 *siv) +{ + return crypto_lskcipher_encrypt(tfm, src, dst, len, siv); +} + +static int bpf_crypto_lskcipher_decrypt(void *tfm, const u8 *src, u8 *dst, + unsigned int len, u8 *siv) +{ + return crypto_lskcipher_decrypt(tfm, src, dst, len, siv); +} + +static const struct bpf_crypto_type bpf_crypto_lskcipher_type = { + .alloc_tfm = bpf_crypto_lskcipher_alloc_tfm, + .free_tfm = bpf_crypto_lskcipher_free_tfm, + .has_algo = bpf_crypto_lskcipher_has_algo, + .setkey = bpf_crypto_lskcipher_setkey, + .encrypt = bpf_crypto_lskcipher_encrypt, + .decrypt = bpf_crypto_lskcipher_decrypt, + .ivsize = bpf_crypto_lskcipher_ivsize, + .statesize = bpf_crypto_lskcipher_statesize, + .get_flags = bpf_crypto_lskcipher_get_flags, + .owner = THIS_MODULE, + .name = "skcipher", +}; + +static int __init bpf_crypto_skcipher_init(void) +{ + return bpf_crypto_register_type(&bpf_crypto_lskcipher_type); +} + +static void __exit bpf_crypto_skcipher_exit(void) +{ + int err = bpf_crypto_unregister_type(&bpf_crypto_lskcipher_type); + WARN_ON_ONCE(err); +} + +module_init(bpf_crypto_skcipher_init); +module_exit(bpf_crypto_skcipher_exit); +MODULE_LICENSE("GPL"); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e52d5b3ee45e..90094400cc63 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -184,8 +184,8 @@ struct bpf_map_ops { }; enum { - /* Support at most 10 fields in a BTF type */ - BTF_FIELDS_MAX = 10, + /* Support at most 11 fields in a BTF type */ + BTF_FIELDS_MAX = 11, }; enum btf_field_type { @@ -202,6 +202,7 @@ enum btf_field_type { BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), + BPF_WORKQUEUE = (1 << 10), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -238,6 +239,7 @@ struct btf_record { u32 field_mask; int spin_lock_off; int timer_off; + int wq_off; int refcount_off; struct btf_field fields[]; }; @@ -312,6 +314,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_spin_lock"; case BPF_TIMER: return "bpf_timer"; 
+ case BPF_WORKQUEUE: + return "bpf_wq"; case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; @@ -340,6 +344,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_spin_lock); case BPF_TIMER: return sizeof(struct bpf_timer); + case BPF_WORKQUEUE: + return sizeof(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: @@ -367,6 +373,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); + case BPF_WORKQUEUE: + return __alignof__(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: @@ -406,6 +414,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: @@ -525,6 +534,7 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); +void bpf_wq_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -1265,6 +1275,7 @@ int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); +bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); @@ -2209,6 +2220,7 @@ void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); @@ -3010,6 +3022,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog); void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); @@ -3108,6 +3121,11 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ diff --git a/include/linux/bpf_crypto.h b/include/linux/bpf_crypto.h new file mode 100644 index 000000000000..a41e71d4e2d9 --- /dev/null +++ b/include/linux/bpf_crypto.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#ifndef _BPF_CRYPTO_H +#define _BPF_CRYPTO_H + +struct bpf_crypto_type { + void *(*alloc_tfm)(const char *algo); + void (*free_tfm)(void *tfm); + int (*has_algo)(const char *algo); + int (*setkey)(void *tfm, const u8 *key, unsigned int keylen); + int (*setauthsize)(void *tfm, unsigned int authsize); + int (*encrypt)(void *tfm, const u8 *src, u8 *dst, unsigned int len, u8 *iv); + int (*decrypt)(void *tfm, const u8 *src, u8 *dst, unsigned int len, u8 *iv); + unsigned int (*ivsize)(void *tfm); + unsigned int (*statesize)(void *tfm); + u32 (*get_flags)(void *tfm); + struct module *owner; + char name[14]; +}; + +int bpf_crypto_register_type(const struct bpf_crypto_type *type); +int bpf_crypto_unregister_type(const struct bpf_crypto_type *type); + +#endif /* _BPF_CRYPTO_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7cb1b75eee38..50aa87f8d77f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -421,11 +421,13 @@ struct bpf_verifier_state { struct bpf_active_lock active_lock; bool speculative; bool active_rcu_lock; + u32 active_preempt_lock; /* If this state was ever pointed-to by other state's loop_entry field * this flag would be set to true. Used to avoid freeing such states * while they are still in use. */ bool used_as_loop_entry; + bool in_sleepable; /* first and last insn idx of this verifier state */ u32 first_insn_idx; @@ -502,6 +504,13 @@ struct bpf_loop_inline_state { u32 callback_subprogno; /* valid when fit_for_inline is true */ }; +/* pointer and state for maps */ +struct bpf_map_ptr_state { + struct bpf_map *map_ptr; + bool poison; + bool unpriv; +}; + /* Possible states for alu_state member. */ #define BPF_ALU_SANITIZE_SRC (1U << 0) #define BPF_ALU_SANITIZE_DST (1U << 1) @@ -514,7 +523,7 @@ struct bpf_loop_inline_state { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - unsigned long map_ptr_state; /* pointer/poison value for maps */ + struct bpf_map_ptr_state map_ptr_state; s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { diff --git a/include/linux/filter.h b/include/linux/filter.h index 44934b968b57..7a27f19bf44d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -75,6 +75,9 @@ struct ctl_table_header; /* unused opcode to mark special load instruction. Same as BPF_MSH */ #define BPF_PROBE_MEM32 0xa0 +/* unused opcode to mark special atomic instruction */ +#define BPF_PROBE_ATOMIC 0xe0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 @@ -178,6 +181,25 @@ struct ctl_table_header; .off = 0, \ .imm = 0 }) +/* Special (internal-only) form of mov, used to resolve per-CPU addrs: + * dst_reg = src_reg + <percpu_base_off> + * BPF_ADDR_PERCPU is used as a special insn->off value. 
+ */ +#define BPF_ADDR_PERCPU (-1) + +#define BPF_MOV64_PERCPU_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = BPF_ADDR_PERCPU, \ + .imm = 0 }) + +static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; +} + /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ @@ -654,14 +676,16 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; - u64 start = sched_clock(); + u64 duration, start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + + duration = sched_clock() - start; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, sched_clock() - start); + u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); @@ -970,11 +994,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); +bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e65ec3fd2799..9c8dd4c01412 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,10 @@ struct sk_psock_progs { struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; + struct bpf_link *msg_parser_link; + struct bpf_link *stream_parser_link; + struct bpf_link *stream_verdict_link; + struct bpf_link *skb_verdict_link; }; enum sk_psock_state_bits { diff --git a/include/net/tcp.h b/include/net/tcp.h index a9eb21251195..fe98fb01879b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2711,10 +2711,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } -static inline void tcp_bpf_rtt(struct sock *sk) +static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) - tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h index 265447e3f71a..0c924d39b7cb 100644 --- a/include/trace/events/bpf_test_run.h +++ b/include/trace/events/bpf_test_run.h @@ -7,6 +7,23 @@ #include <linux/tracepoint.h> +TRACE_EVENT(bpf_trigger_tp, + + TP_PROTO(int nonce), + + TP_ARGS(nonce), + + TP_STRUCT__entry( + __field(int, nonce) + ), + + TP_fast_assign( + __entry->nonce = nonce; + ), + + TP_printk("nonce %d", __entry->nonce) +); + DECLARE_EVENT_CLASS(bpf_test_finish, TP_PROTO(int *err), diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9585f5345353..d94a72593ead 100644 --- 
a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1135,6 +1135,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -3394,6 +3395,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -5022,7 +5027,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. * * struct socket *bpf_sock_from_file(struct file *file) @@ -5508,7 +5513,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) @@ -6720,6 +6725,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); @@ -6938,6 +6947,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. 
* It will be called to handle @@ -7120,6 +7131,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7152,7 +7164,7 @@ struct bpf_fib_lookup { /* output: MTU value */ __u16 mtu_result; - }; + } __attribute__((packed, aligned(2))); /* input: L3 device index for lookup * output: device index from FIB lookup */ @@ -7197,8 +7209,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { @@ -7285,6 +7308,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e497011261b8..85786fd97d2a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -44,6 +44,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif +ifeq ($(CONFIG_CRYPTO),y) +obj-$(CONFIG_BPF_SYSCALL) += crypto.o +endif obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c8dd..6c81630c5293 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -37,7 +37,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 13358675ff2e..580d07b15471 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -246,6 +246,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ +static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + + BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); + + *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); + if (!map->bypass_spec_v1) { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); + } + + *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); + return insn - insn_buf; +} + static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct 
bpf_array, map); @@ -396,17 +428,21 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers(struct bpf_map *map) +static void array_map_free_timers_wq(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer on uref dropping to zero. */ - if (!btf_record_has_field(map->record, BPF_TIMER)) - return; + /* We don't reset or free fields other than timer and workqueue + * on uref dropping to zero. + */ + if (btf_record_has_field(map->record, BPF_TIMER)) + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -750,7 +786,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers, + .map_release_uref = array_map_free_timers_wq, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, @@ -776,6 +812,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, + .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index bdea1a459153..976cb258a0ed 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -318,7 +318,7 @@ static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, * * If the local_storage->list is already empty, the caller will not * care about the bpf_ma value also because the caller is not - * responsibile to free the local_storage. + * responsible to free the local_storage. 
*/ if (storage_smap) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 90c4a32d89ff..8291fbfd27b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3464,6 +3464,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } + if (field_mask & BPF_WORKQUEUE) { + if (!strcmp(name, "bpf_wq")) { + if (*seen_mask & BPF_WORKQUEUE) + return -E2BIG; + *seen_mask |= BPF_WORKQUEUE; + type = BPF_WORKQUEUE; + goto end; + } + } field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); @@ -3515,6 +3524,7 @@ static int btf_find_struct_field(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3582,6 +3592,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3816,6 +3827,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; + rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); @@ -3846,6 +3858,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; + case BPF_WORKQUEUE: + WARN_ON_ONCE(rec->wq_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->wq_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ @@ -5642,8 +5659,8 @@ errout_free: return ERR_PTR(err); } -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) @@ -5971,6 +5988,9 @@ struct btf *btf_parse_vmlinux(void) struct btf *btf = NULL; int err; + if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) + return ERR_PTR(-ENOENT); + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5aacb1d3c4cc..95c7fd093e55 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -747,7 +747,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) @@ -813,7 +813,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; @@ -2218,6 +2218,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ @@ -2231,6 +2232,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ + 
kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ @@ -2812,7 +2814,7 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); -/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +/* RNG for unprivileged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) @@ -2943,6 +2945,11 @@ bool __weak bpf_jit_supports_subprog_tailcalls(void) return false; } +bool __weak bpf_jit_supports_percpu_insn(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; @@ -2958,6 +2965,11 @@ bool __weak bpf_jit_supports_arena(void) return false; } +bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + return false; +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index dad0fb1c8e87..33c473d676a5 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -474,6 +474,7 @@ static int __init cpumask_kfunc_init(void) ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, ARRAY_SIZE(cpumask_dtors), THIS_MODULE); diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c new file mode 100644 index 000000000000..2bee4af91e38 --- /dev/null +++ b/kernel/bpf/crypto.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta, Inc */ +#include <linux/bpf.h> +#include <linux/bpf_crypto.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/filter.h> +#include <linux/scatterlist.h> +#include <linux/skbuff.h> +#include <crypto/skcipher.h> + +struct bpf_crypto_type_list { + const struct bpf_crypto_type *type; + struct list_head list; +}; + +/* BPF crypto initialization parameters struct */ +/** + * struct bpf_crypto_params - BPF crypto initialization parameters structure + * @type: The string of crypto operation type. + * @reserved: Reserved member, will be reused for more options in future + * Values: + * 0 + * @algo: The string of algorithm to initialize. + * @key: The cipher key used to init crypto algorithm. + * @key_len: The length of cipher key. + * @authsize: The length of authentication tag used by algorithm. + */ +struct bpf_crypto_params { + char type[14]; + u8 reserved[2]; + char algo[128]; + u8 key[256]; + u32 key_len; + u32 authsize; +}; + +static LIST_HEAD(bpf_crypto_types); +static DECLARE_RWSEM(bpf_crypto_types_sem); + +/** + * struct bpf_crypto_ctx - refcounted BPF crypto context structure + * @type: The pointer to bpf crypto type + * @tfm: The pointer to instance of crypto API struct. + * @siv_len: Size of IV and state storage for cipher + * @rcu: The RCU head used to free the crypto context with RCU safety. + * @usage: Object reference counter. When the refcount goes to 0, the + * memory is released back to the BPF allocator, which provides + * RCU safety. 
+ */ +struct bpf_crypto_ctx { + const struct bpf_crypto_type *type; + void *tfm; + u32 siv_len; + struct rcu_head rcu; + refcount_t usage; +}; + +int bpf_crypto_register_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -EEXIST; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (!strcmp(node->type->name, type->name)) + goto unlock; + } + + node = kmalloc(sizeof(*node), GFP_KERNEL); + err = -ENOMEM; + if (!node) + goto unlock; + + node->type = type; + list_add(&node->list, &bpf_crypto_types); + err = 0; + +unlock: + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_register_type); + +int bpf_crypto_unregister_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -ENOENT; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, type->name)) + continue; + + list_del(&node->list); + kfree(node); + err = 0; + break; + } + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type); + +static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name) +{ + const struct bpf_crypto_type *type = ERR_PTR(-ENOENT); + struct bpf_crypto_type_list *node; + + down_read(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, name)) + continue; + + if (try_module_get(node->type->owner)) + type = node->type; + break; + } + up_read(&bpf_crypto_types_sem); + + return type; +} + +__bpf_kfunc_start_defs(); + +/** + * bpf_crypto_ctx_create() - Create a mutable BPF crypto context. + * + * Allocates a crypto context that can be used, acquired, and released by + * a BPF program. The crypto context returned by this function must either + * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release(). + * As crypto API functions use GFP_KERNEL allocations, this function can + * only be used in sleepable BPF programs. + * + * bpf_crypto_ctx_create() allocates memory for crypto context. + * It may return NULL if no memory is available. + * @params: pointer to struct bpf_crypto_params which contains all the + * details needed to initialise crypto context. + * @params__sz: size of struct bpf_crypto_params used by bpf program + * @err: integer to store error code when NULL is returned.
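 *
 * A minimal usage sketch from a sleepable BPF program. The "skcipher" type
 * is assumed to have been registered via bpf_crypto_register_type(),
 * "ecb(aes)" with a 16-byte key is only an illustrative algorithm choice,
 * and key_bytes stands for the caller's key material:
 *
 *	struct bpf_crypto_params params = {
 *		.type = "skcipher",
 *		.algo = "ecb(aes)",
 *		.key_len = 16,
 *	};
 *	struct bpf_crypto_ctx *cctx;
 *	int err;
 *
 *	__builtin_memcpy(params.key, key_bytes, 16);
 *	cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
 *	if (!cctx)
 *		return err;
 *	... use bpf_crypto_encrypt()/bpf_crypto_decrypt() on dynptrs here ...
 *	bpf_crypto_ctx_release(cctx);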
+ */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz, + int *err) +{ + const struct bpf_crypto_type *type; + struct bpf_crypto_ctx *ctx; + + if (!params || params->reserved[0] || params->reserved[1] || + params__sz != sizeof(struct bpf_crypto_params)) { + *err = -EINVAL; + return NULL; + } + + type = bpf_crypto_get_type(params->type); + if (IS_ERR(type)) { + *err = PTR_ERR(type); + return NULL; + } + + if (!type->has_algo(params->algo)) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!!params->authsize ^ !!type->setauthsize) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!params->key_len || params->key_len > sizeof(params->key)) { + *err = -EINVAL; + goto err_module_put; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + *err = -ENOMEM; + goto err_module_put; + } + + ctx->type = type; + ctx->tfm = type->alloc_tfm(params->algo); + if (IS_ERR(ctx->tfm)) { + *err = PTR_ERR(ctx->tfm); + goto err_free_ctx; + } + + if (params->authsize) { + *err = type->setauthsize(ctx->tfm, params->authsize); + if (*err) + goto err_free_tfm; + } + + *err = type->setkey(ctx->tfm, params->key, params->key_len); + if (*err) + goto err_free_tfm; + + if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) { + *err = -EINVAL; + goto err_free_tfm; + } + + ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm); + + refcount_set(&ctx->usage, 1); + + return ctx; + +err_free_tfm: + type->free_tfm(ctx->tfm); +err_free_ctx: + kfree(ctx); +err_module_put: + module_put(type->owner); + + return NULL; +} + +static void crypto_free_cb(struct rcu_head *head) +{ + struct bpf_crypto_ctx *ctx; + + ctx = container_of(head, struct bpf_crypto_ctx, rcu); + ctx->type->free_tfm(ctx->tfm); + module_put(ctx->type->owner); + kfree(ctx); +} + +/** + * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context. + * @ctx: The BPF crypto context being acquired. The ctx must be a trusted + * pointer. + * + * Acquires a reference to a BPF crypto context. The context returned by this function + * must either be embedded in a map as a kptr, or freed with + * bpf_crypto_ctx_release(). + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) +{ + if (!refcount_inc_not_zero(&ctx->usage)) + return NULL; + return ctx; +} + +/** + * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context. + * @ctx: The crypto context being released. + * + * Releases a previously acquired reference to a BPF crypto context. When the final + * reference of the BPF crypto context has been released, its memory + * will be released. + */ +__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) +{ + if (refcount_dec_and_test(&ctx->usage)) + call_rcu(&ctx->rcu, crypto_free_cb); +} + +static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv, + bool decrypt) +{ + u32 src_len, dst_len, siv_len; + const u8 *psrc; + u8 *pdst, *piv; + int err; + + if (__bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + siv_len = __bpf_dynptr_size(siv); + src_len = __bpf_dynptr_size(src); + dst_len = __bpf_dynptr_size(dst); + if (!src_len || !dst_len) + return -EINVAL; + + if (siv_len != ctx->siv_len) + return -EINVAL; + + psrc = __bpf_dynptr_data(src, src_len); + if (!psrc) + return -EINVAL; + pdst = __bpf_dynptr_data_rw(dst, dst_len); + if (!pdst) + return -EINVAL; + + piv = siv_len ? 
__bpf_dynptr_data_rw(siv, siv_len) : NULL; + if (siv_len && !piv) + return -EINVAL; + + err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) + : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv); + + return err; +} + +/** + * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer. + * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, true); +} + +/** + * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the plain data. Must be a trusted pointer. + * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, false); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(crypt_init_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_KFUNCS_END(crypt_init_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_init_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_init_kfunc_btf_ids, +}; + +BTF_KFUNCS_START(crypt_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU) +BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU) +BTF_KFUNCS_END(crypt_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_kfunc_btf_ids, +}; + +BTF_ID_LIST(bpf_crypto_dtor_ids) +BTF_ID(struct, bpf_crypto_ctx) +BTF_ID(func, bpf_crypto_ctx_release) + +static int __init crypto_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = { + { + .btf_id = bpf_crypto_dtor_ids[0], + .kfunc_btf_id = bpf_crypto_dtor_ids[1] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &crypt_init_kfunc_set); + return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors, + ARRAY_SIZE(bpf_crypto_dtors), + THIS_MODULE); +} + +late_initcall(crypto_kfunc_init); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bd2e2dd04740..309c4aa1b026 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn) insn->off == BPF_ADDR_SPACE_CAST; } +/* Special 
(internal-only) form of mov, used to resolve per-CPU addrs: + * dst_reg = src_reg + <percpu_base_off> + * BPF_ADDR_PERCPU is used as a special insn->off value. + */ +#define BPF_ADDR_PERCPU (-1) + +static inline bool is_mov_percpu_addr(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -194,6 +205,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); + } else if (is_mov_percpu_addr(insn)) { + verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n", + insn->code, insn->dst_reg, insn->src_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3a088a5349bc..0179183c543a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -240,6 +240,26 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_wq(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -1490,11 +1510,12 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } + cond_resched(); } migrate_enable(); } -static void htab_free_malloced_timers(struct bpf_htab *htab) +static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) { int i; @@ -1506,24 +1527,35 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); + if (is_timer) + bpf_obj_free_timer(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); + else + bpf_obj_free_workqueue(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } rcu_read_unlock(); } -static void htab_map_free_timers(struct bpf_map *map) +static void htab_map_free_timers_and_wq(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer on uref dropping to zero */ - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers(htab); - else - htab_free_prealloced_timers(htab); + /* We only free timer and workqueue on uref dropping to zero */ + if (btf_record_has_field(htab->map.record, BPF_TIMER)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, true); + else + htab_free_prealloced_timers(htab); + } + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, false); + else + htab_free_prealloced_wq(htab); + } } /* Called when map->refcnt goes to zero, 
either from workqueue or from syscall */ @@ -1538,7 +1570,7 @@ static void htab_map_free(struct bpf_map *map) */ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it - * underneath and is reponsible for waiting for callbacks to finish + * underneath and is responsible for waiting for callbacks to finish * during bpf_mem_alloc_destroy(). */ if (!htab_is_prealloc(htab)) { @@ -2259,7 +2291,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2280,7 +2312,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, @@ -2307,6 +2339,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call for per-CPU hashmap */ +static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, + offsetof(struct htab_elem, key) + map->key_size); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + + return insn - insn_buf; +} + static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; @@ -2435,6 +2487,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_gen_lookup = htab_percpu_map_gen_lookup, .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f860adab3eb9..2a69a9a36c0f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1079,11 +1079,20 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +struct bpf_async_cb { + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; + struct rcu_head rcu; + u64 flags; +}; + /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. - * That space is used to keep 'struct bpf_timer_kern'. + * That space is used to keep 'struct bpf_async_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. 
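 * From the BPF program side, the intended flow looks roughly like the
 * following sketch (bpf_wq_set_callback() is assumed to be the program-side
 * wrapper around the bpf_wq_set_callback_impl() kfunc; timer_cb, wq_cb and
 * nsecs are placeholders):
 *
 *	struct elem {
 *		struct bpf_timer t;
 *		struct bpf_wq w;
 *	};
 *
 *	val = bpf_map_lookup_elem(&map, &key);
 *	if (!val)
 *		return 0;
 *	bpf_timer_init(&val->t, &map, CLOCK_MONOTONIC);
 *	bpf_timer_set_callback(&val->t, timer_cb);
 *	bpf_timer_start(&val->t, nsecs, 0);
 *
 *	bpf_wq_init(&val->w, &map, 0);
 *	bpf_wq_set_callback(&val->w, wq_cb, 0);
 *	bpf_wq_start(&val->w, 0);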
@@ -1096,17 +1105,23 @@ const struct bpf_func_proto bpf_snprintf_proto = { * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { + struct bpf_async_cb cb; struct hrtimer timer; - struct bpf_map *map; - struct bpf_prog *prog; - void __rcu *callback_fn; - void *value; - struct rcu_head rcu; }; -/* the actual struct hidden inside uapi struct bpf_timer */ -struct bpf_timer_kern { - struct bpf_hrtimer *timer; +struct bpf_work { + struct bpf_async_cb cb; + struct work_struct work; + struct work_struct delete_work; +}; + +/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ +struct bpf_async_kern { + union { + struct bpf_async_cb *cb; + struct bpf_hrtimer *timer; + struct bpf_work *work; + }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. @@ -1114,19 +1129,24 @@ struct bpf_timer_kern { struct bpf_spin_lock lock; } __attribute__((aligned(8))); +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, + BPF_ASYNC_TYPE_WQ, +}; + static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); - struct bpf_map *map = t->map; - void *value = t->value; + struct bpf_map *map = t->cb.map; + void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); - callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; @@ -1155,46 +1175,112 @@ out: return HRTIMER_NORESTART; } -BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, - u64, flags) +static void bpf_wq_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, work); + struct bpf_async_cb *cb = &w->cb; + struct bpf_map *map = cb->map; + bpf_callback_t callback_fn; + void *value = cb->value; + void *key; + u32 idx; + + BTF_TYPE_EMIT(struct bpf_wq); + + callback_fn = READ_ONCE(cb->callback_fn); + if (!callback_fn) + return; + + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + rcu_read_lock_trace(); + migrate_disable(); + + callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); + + migrate_enable(); + rcu_read_unlock_trace(); +} + +static void bpf_wq_delete_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, delete_work); + + cancel_work_sync(&w->work); + + kfree_rcu(w, cb.rcu); +} + +static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, + enum bpf_async_type type) { - clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_async_cb *cb; struct bpf_hrtimer *t; + struct bpf_work *w; + clockid_t clockid; + size_t size; int ret = 0; - BUILD_BUG_ON(MAX_CLOCKS != 16); - BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); - BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); - if (in_nmi()) return -EOPNOTSUPP; - if (flags >= MAX_CLOCKS || - /* similar to timerfd except _ALARM variants are not supported */ - (clockid != 
CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME && - clockid != CLOCK_BOOTTIME)) + switch (type) { + case BPF_ASYNC_TYPE_TIMER: + size = sizeof(struct bpf_hrtimer); + break; + case BPF_ASYNC_TYPE_WQ: + size = sizeof(struct bpf_work); + break; + default: return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; + } + + __bpf_spin_lock_irqsave(&async->lock); + t = async->timer; if (t) { ret = -EBUSY; goto out; } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ - t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); - if (!t) { + cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + if (!cb) { ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->record->timer_off; - t->map = map; - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; - WRITE_ONCE(timer->timer, t); - /* Guarantee the order between timer->timer and map->usercnt. So + + switch (type) { + case BPF_ASYNC_TYPE_TIMER: + clockid = flags & (MAX_CLOCKS - 1); + t = (struct bpf_hrtimer *)cb; + + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + cb->value = (void *)async - map->record->timer_off; + break; + case BPF_ASYNC_TYPE_WQ: + w = (struct bpf_work *)cb; + + INIT_WORK(&w->work, bpf_wq_work); + INIT_WORK(&w->delete_work, bpf_wq_delete_work); + cb->value = (void *)async - map->record->wq_off; + break; + } + cb->map = map; + cb->prog = NULL; + cb->flags = flags; + rcu_assign_pointer(cb->callback_fn, NULL); + + WRITE_ONCE(async->cb, cb); + /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. @@ -1204,15 +1290,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map /* maps with timers must be either held by user space * or pinned in bpffs. 
*/ - WRITE_ONCE(timer->timer, NULL); - kfree(t); + WRITE_ONCE(async->cb, NULL); + kfree(cb); ret = -EPERM; } out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clock_t clockid = flags & (MAX_CLOCKS - 1); + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + + return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, @@ -1222,22 +1327,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, - struct bpf_prog_aux *, aux) +static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, + struct bpf_prog_aux *aux, unsigned int flags, + enum bpf_async_type type) { struct bpf_prog *prev, *prog = aux->prog; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { + __bpf_spin_lock_irqsave(&async->lock); + cb = async->cb; + if (!cb) { ret = -EINVAL; goto out; } - if (!atomic64_read(&t->map->usercnt)) { + if (!atomic64_read(&cb->map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space @@ -1246,7 +1352,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb ret = -EPERM; goto out; } - prev = t->prog; + prev = cb->prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. 
@@ -1259,14 +1365,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - t->prog = prog; + cb->prog = prog; } - rcu_assign_pointer(t->callback_fn, callback_fn); + rcu_assign_pointer(cb->callback_fn, callback_fn); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, @@ -1275,7 +1387,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; int ret = 0; @@ -1287,7 +1399,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; - if (!t || !t->prog) { + if (!t || !t->cb.prog) { ret = -EINVAL; goto out; } @@ -1315,18 +1427,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_hrtimer *t) +static void drop_prog_refcnt(struct bpf_async_cb *async) { - struct bpf_prog *prog = t->prog; + struct bpf_prog *prog = async->prog; if (prog) { bpf_prog_put(prog); - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); + async->prog = NULL; + rcu_assign_pointer(async->callback_fn, NULL); } } -BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t; int ret = 0; @@ -1348,7 +1460,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) ret = -EDEADLK; goto out; } - drop_prog_refcnt(t); + drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish @@ -1366,36 +1478,44 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; -/* This function is called by map_delete/update_elem for individual element and - * by ops->map_release_uref when the user space reference to a map reaches zero. - */ -void bpf_timer_cancel_and_free(void *val) +static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) { - struct bpf_timer_kern *timer = val; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; - /* Performance optimization: read timer->timer without lock first. */ - if (!READ_ONCE(timer->timer)) - return; + /* Performance optimization: read async->cb without lock first. */ + if (!READ_ONCE(async->cb)) + return NULL; - __bpf_spin_lock_irqsave(&timer->lock); + __bpf_spin_lock_irqsave(&async->lock); /* re-read it under lock */ - t = timer->timer; - if (!t) + cb = async->cb; + if (!cb) goto out; - drop_prog_refcnt(t); + drop_prog_refcnt(cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. 
*/ - WRITE_ONCE(timer->timer, NULL); + WRITE_ONCE(async->cb, NULL); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); + return cb; +} + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_hrtimer *t; + + t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); + if (!t) return; /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call kfree(t) * right after for both preallocated and non-preallocated maps. - * The timer->timer = NULL was already done and no code path can + * The async->cb = NULL was already done and no code path can * see address 't' anymore. * * Check that bpf_map_delete/update_elem() wasn't called from timer @@ -1404,13 +1524,33 @@ out: * return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', - * since timer->timer = NULL was already done. The timer will be + * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); - kfree_rcu(t, rcu); + kfree_rcu(t, cb.rcu); +} + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_wq_cancel_and_free(void *val) +{ + struct bpf_work *work; + + BTF_TYPE_EMIT(struct bpf_wq); + + work = (struct bpf_work *)__bpf_async_cancel_and_free(val); + if (!work) + return; + /* Trigger cancel of the sleepable work, but *do not* wait for + * it to finish if it was running as we might not be in a + * sleepable context. + * kfree will be called once the work has finished. + */ + schedule_work(&work->delete_work); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) @@ -1443,7 +1583,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) +bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -2412,7 +2552,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer - * if the bpf program allows skb data writes. There are two possiblities + * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. 
In this case, the @@ -2549,6 +2689,61 @@ __bpf_kfunc void bpf_throw(u64 cookie) WARN(1, "A call to BPF exception callback should never return\n"); } +__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_map *map = p__map; + + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); + + if (flags) + return -EINVAL; + + return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); +} + +__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_work *w; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + w = READ_ONCE(async->work); + if (!w || !READ_ONCE(w->cb.prog)) + return -EINVAL; + + schedule_work(&w->work); + return 0; +} + +__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags, + void *aux__ign) +{ + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + + if (flags) + return -EINVAL; + + return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); +} + +__bpf_kfunc void bpf_preempt_disable(void) +{ + preempt_disable(); +} + +__bpf_kfunc void bpf_preempt_enable(void) +{ + preempt_enable(); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2625,6 +2820,12 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) +BTF_ID_FLAGS(func, bpf_wq_init) +BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_start) +BTF_ID_FLAGS(func, bpf_preempt_disable) +BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -2652,6 +2853,7 @@ static int __init kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 2a243cf37c60..4bd8f17a9f24 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -467,9 +467,9 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) if (type & PTR_MAYBE_NULL) { if (base_type(type) == PTR_TO_BTF_ID) - strncpy(postfix, "or_null_", 16); + strscpy(postfix, "or_null_"); else - strncpy(postfix, "_or_null", 16); + strscpy(postfix, "_or_null"); } snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 939620b91c0e..0218a5132ab5 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -316,6 +316,7 @@ static long trie_update_elem(struct bpf_map *map, { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; @@ 
-390,7 +391,7 @@ static long trie_update_elem(struct bpf_map *map, trie->n_entries--; rcu_assign_pointer(*slot, new_node); - kfree_rcu(node, rcu); + free_node = node; goto out; } @@ -437,6 +438,7 @@ out: } spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_node, rcu); return ret; } @@ -445,6 +447,7 @@ out: static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *free_node = NULL, *free_parent = NULL; struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; @@ -514,8 +517,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) else rcu_assign_pointer( *trim2, rcu_access_pointer(parent->child[0])); - kfree_rcu(parent, rcu); - kfree_rcu(node, rcu); + free_parent = parent; + free_node = node; goto out; } @@ -529,10 +532,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); else RCU_INIT_POINTER(*trim, NULL); - kfree_rcu(node, rcu); + free_node = node; out: spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_parent, rcu); + kfree_rcu(free_node, rcu); return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0f2f052a02c..f655adf42e39 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -559,6 +559,7 @@ void btf_record_free(struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to release */ break; default: @@ -608,6 +609,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to acquire */ break; default: @@ -659,6 +661,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) + return; + bpf_wq_cancel_and_free(obj + rec->wq_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -679,6 +688,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; + case BPF_WORKQUEUE: + bpf_wq_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1085,7 +1097,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1115,6 +1127,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, } break; case BPF_TIMER: + case BPF_WORKQUEUE: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { @@ -5242,6 +5255,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: + ret = sock_map_link_create(attr, prog); + break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index ef6911aee3bb..fedb54c94cdb 100644 --- 
a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,8 +9,8 @@ #include <linux/sysfs.h> /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; static ssize_t btf_vmlinux_read(struct file *file, struct kobject *kobj, @@ -32,7 +32,7 @@ static int __init btf_vmlinux_init(void) { bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) + if (bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index cc50607f8d8c..26ae703d3c3b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -885,12 +885,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog, * Hence check that 'start' is valid. */ start > NO_START_TIME) { + u64 duration = sched_clock() - start; unsigned long flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, sched_clock() - start); + u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 36f5a9455205..87ff414899cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -172,7 +172,7 @@ static bool bpf_global_percpu_ma_set; /* verifier_state + insn_idx are pushed to stack when branch is encountered */ struct bpf_verifier_stack_elem { - /* verifer state is 'st' + /* verifier state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ @@ -190,11 +190,6 @@ struct bpf_verifier_stack_elem { #define BPF_MAP_KEY_POISON (1ULL << 63) #define BPF_MAP_KEY_SEEN (1ULL << 62) -#define BPF_MAP_PTR_UNPRIV 1UL -#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ - POISON_POINTER_DELTA)) -#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) - #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); @@ -209,21 +204,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { - return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; + return aux->map_ptr_state.poison; } static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { - return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV; + return aux->map_ptr_state.unpriv; } static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, - const struct bpf_map *map, bool unpriv) + struct bpf_map *map, + bool unpriv, bool poison) { - BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_ptr_state = (unsigned long)map | - (unpriv ? 
BPF_MAP_PTR_UNPRIV : 0UL); + aux->map_ptr_state.unpriv = unpriv; + aux->map_ptr_state.poison = poison; + aux->map_ptr_state.map_ptr = map; } static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) @@ -336,6 +332,10 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; + struct { + struct bpf_map *ptr; + int uid; + } map; u64 mem_size; }; @@ -501,8 +501,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) } static bool is_sync_callback_calling_kfunc(u32 btf_id); +static bool is_async_callback_calling_kfunc(u32 btf_id); +static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); + static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || @@ -530,7 +534,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn) static bool is_async_callback_calling_insn(struct bpf_insn *insn) { - return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm); + return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } static bool is_may_goto_insn(struct bpf_insn *insn) @@ -1429,6 +1434,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->active_rcu_lock = src->active_rcu_lock; + dst_state->active_preempt_lock = src->active_preempt_lock; + dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; dst_state->active_lock.id = src->active_lock.id; @@ -1842,6 +1849,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) */ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; + if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -2135,7 +2144,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) { /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tigher range. + * values on both sides of 64-bit range in hope to have tighter range. * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound @@ -2143,7 +2152,7 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respecitve s64 or u64 domain, just + * with are well-formed ranges in respective s64 or u64 domain, just * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. 
*/ __u64 new_umin, new_umax; @@ -2402,7 +2411,7 @@ static void init_func_state(struct bpf_verifier_env *env, /* Similar to push_stack(), but for async callbacks */ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, - int subprog) + int subprog, bool is_sleepable) { struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; @@ -2429,6 +2438,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * Initialize it similar to do_check_common(). */ elem->st.branches = 1; + elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -3615,7 +3625,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * sreg needs precision before this insn */ bt_clear_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } else { /* dreg = K * dreg needs precision after this insn. @@ -3631,7 +3642,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * both dreg and sreg need precision * before this insn */ - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } /* else dreg += K * dreg still needs precision before this insn */ @@ -5274,7 +5286,8 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable; + return env->prog->sleepable || + (env->cur_state && env->cur_state->in_sleepable); } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -5297,6 +5310,7 @@ BTF_ID(struct, cgroup) BTF_ID(struct, bpf_cpumask) #endif BTF_ID(struct, task_struct) +BTF_ID(struct, bpf_crypto_ctx) BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) @@ -6972,6 +6986,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, + bool allow_trust_mismatch); + static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { int load_reg; @@ -7032,7 +7049,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg) || - is_arena_reg(env, insn->dst_reg)) { + (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7068,6 +7085,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; + if (is_arena_reg(env, insn->dst_reg)) { + err = save_aux_ptr_type(env, PTR_TO_ARENA, false); + if (err) + return err; + } /* Check whether we can write into the same memory. 
*/ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); @@ -7590,6 +7612,23 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_wq_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (map->record->wq_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", + val + reg->off, map->record->wq_off); + return -EINVAL; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -9484,7 +9523,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins */ env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && - !is_sync_callback_calling_kfunc(insn->imm)) { + !is_callback_calling_kfunc(insn->imm)) { verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", func_id_name(insn->imm), insn->imm); return -EFAULT; @@ -9498,10 +9537,11 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins if (is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; - /* there is no real recursion here. timer callbacks are async */ + /* there is no real recursion here. timer and workqueue callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - insn_idx, subprog); + insn_idx, subprog, + is_bpf_wq_set_callback_impl_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -9561,6 +9601,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + /* Only global subprogs cannot be called with preemption disabled. 
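 * For instance (an illustrative sketch; some_global_subprog() is a
 * placeholder name), the verifier is expected to reject
 *
 *	bpf_preempt_disable();
 *	some_global_subprog();
 *	bpf_preempt_enable();
 *
 * while the same sequence calling a static subprog is accepted.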
*/ + if (env->cur_state->active_preempt_lock) { + verbose(env, "global function calls are not allowed with preemption disabled,\n" + "use static function instead\n"); + return -EINVAL; + } + if (err) { verbose(env, "Caller passes invalid args into func#%d ('%s')\n", subprog, sub_name); @@ -9653,12 +9700,8 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_map *map; int err; - if (bpf_map_ptr_poisoned(insn_aux)) { - verbose(env, "tail_call abusing map_ptr\n"); - return -EINVAL; - } - - map = BPF_MAP_PTR(insn_aux->map_ptr_state); + /* valid map_ptr and poison value does not matter */ + map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { verbose(env, "callback function not allowed for map\n"); @@ -10017,12 +10060,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EACCES; } - if (!BPF_MAP_PTR(aux->map_ptr_state)) + if (!aux->map_ptr_state.map_ptr) + bpf_map_ptr_store(aux, meta->map_ptr, + !meta->map_ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map_ptr) bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1); - else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) - bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - !meta->map_ptr->bypass_spec_v1); + !meta->map_ptr->bypass_spec_v1, true); return 0; } @@ -10201,8 +10244,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { - verbose(env, "unknown func %s#%d\n", func_id_name(func_id), - func_id); + verbose(env, "program of this type cannot use helper %s#%d\n", + func_id_name(func_id), func_id); return -EINVAL; } @@ -10251,6 +10294,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + if (env->cur_state->active_preempt_lock) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in non-preemptible region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (in_sleepable(env) && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -10839,6 +10893,7 @@ enum { KF_ARG_LIST_NODE_ID, KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, + KF_ARG_WORKQUEUE_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -10847,6 +10902,7 @@ BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) +BTF_ID(struct, bpf_wq) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -10890,6 +10946,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -10959,6 +11020,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_WORKQUEUE, }; enum special_kfunc_type { @@ -10984,6 +11046,9 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, + KF_bpf_wq_set_callback_impl, + 
KF_bpf_preempt_disable, + KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, }; @@ -11008,6 +11073,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #endif @@ -11036,6 +11102,9 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_preempt_disable) +BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else @@ -11062,6 +11131,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; } +static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable]; +} + +static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; +} + static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -11115,6 +11194,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_map(meta->btf, &args[argno])) return KF_ARG_PTR_TO_MAP; + if (is_kfunc_arg_wq(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_WORKQUEUE; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11366,12 +11448,28 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } +static bool is_async_callback_calling_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + static bool is_bpf_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; } +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + +static bool is_callback_calling_kfunc(u32 btf_id) +{ + return is_sync_callback_calling_kfunc(btf_id) || + is_async_callback_calling_kfunc(btf_id); +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); @@ -11716,6 +11814,34 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_NULL: continue; case KF_ARG_PTR_TO_MAP: + if (!reg->map_ptr) { + verbose(env, "pointer in R%d isn't map pointer\n", regno); + return -EINVAL; + } + if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + /* Use map_uid (which is unique id of inner map) to reject: + * inner_map1 = bpf_map_lookup_elem(outer_map, key1) + * inner_map2 = bpf_map_lookup_elem(outer_map, key2) + * if (inner_map1 && inner_map2) { + * wq = bpf_map_lookup_elem(inner_map1); + * if (wq) + * // mismatch would have been allowed + * bpf_wq_init(wq, inner_map2); + * } + * + * Comparing map_ptr is enough to distinguish normal and outer maps. 
+ */ + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { + verbose(env, + "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } + } + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; + fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) @@ -11748,6 +11874,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: + case KF_ARG_PTR_TO_WORKQUEUE: /* Trusted by default */ break; default: @@ -12034,6 +12161,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret) return ret; break; + case KF_ARG_PTR_TO_WORKQUEUE: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_wq_func(env, regno, meta); + if (ret < 0) + return ret; + break; } } @@ -12093,11 +12229,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { - const struct btf_type *t, *ptr_type; + bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; u32 i, nargs, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; - bool sleepable, rcu_lock, rcu_unlock; + const struct btf_type *t, *ptr_type; struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; @@ -12145,9 +12281,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); + preempt_disable = is_kfunc_bpf_preempt_disable(&meta); + preempt_enable = is_kfunc_bpf_preempt_enable(&meta); + if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; struct bpf_reg_state *reg; @@ -12180,6 +12329,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + env->cur_state->active_preempt_lock--; + } else if (sleepable) { + verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); + return -EACCES; + } + } else if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); + return -EINVAL; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. 
*/ @@ -13318,7 +13483,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umax_val = src_reg->u32_max_value; if (src_known && dst_known) { @@ -13331,18 +13495,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13351,7 +13513,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { @@ -13364,18 +13525,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = min(dst_reg->umax_value, umax_val); - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13387,7 +13546,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umin_val = src_reg->u32_min_value; if (src_known && dst_known) { @@ -13400,18 +13558,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. 
Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13420,7 +13576,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { @@ -13433,18 +13588,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = max(dst_reg->umin_value, umin_val); dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13456,7 +13609,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -13467,10 +13619,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u32 result into s32. - */ + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } else { @@ -13484,7 +13636,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; if (src_known && dst_known) { /* dst_reg->var_off.value has been updated earlier */ @@ -13496,10 +13647,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u64 result into s64. - */ + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. 
+ */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; } else { @@ -14726,7 +14877,7 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state /* Adjusts the register min/max values in the case that the dst_reg and * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K - * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * check, in which case we have a fake SCALAR_VALUE representing insn->imm). * Technically we can do similar adjustments for pointers to the same object, * but we don't support that right now. */ @@ -15341,6 +15492,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -16908,6 +17064,12 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_rcu_lock != cur->active_rcu_lock) return false; + if (old->active_preempt_lock != cur->active_preempt_lock) + return false; + + if (old->in_sleepable != cur->in_sleepable) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -17364,7 +17526,7 @@ hit: err = propagate_liveness(env, &sl->state, cur); /* if previous state reached the exit with precision and - * current state is equivalent to it (except precsion marks) + * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ @@ -17542,7 +17704,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_missmatch) + bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; @@ -17560,7 +17722,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * src_reg == stack|map in some other branch. * Reject it. */ - if (allow_trust_missmatch && + if (allow_trust_mismatch && base_type(type) == PTR_TO_BTF_ID && base_type(*prev_type) == PTR_TO_BTF_ID) { /* @@ -17856,6 +18018,13 @@ process_bpf_exit_full: return -EINVAL; } + if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) { + verbose(env, "%d bpf_preempt_enable%s missing\n", + env->cur_state->active_preempt_lock, + env->cur_state->active_preempt_lock == 1 ? 
" is" : "(s) are"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback @@ -18153,6 +18322,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_wq yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); @@ -18348,6 +18524,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } if (env->used_map_cnt >= MAX_USED_MAPS) { + verbose(env, "The total number of maps per program has reached the limit of %u\n", + MAX_USED_MAPS); fdput(f); return -E2BIG; } @@ -18962,6 +19140,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && + env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { + insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; } else { continue; } @@ -19148,12 +19332,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) env->insn_aux_data[i].call_imm = insn->imm; /* point imm to __bpf_call_base+1 from JITs point of view */ insn->imm = 1; - if (bpf_pseudo_func(insn)) + if (bpf_pseudo_func(insn)) { +#if defined(MODULES_VADDR) + u64 addr = MODULES_VADDR; +#else + u64 addr = VMALLOC_START; +#endif /* jit (e.g. x86_64) may emit fewer instructions * if it learns a u32 imm is the same as a u64 imm. - * Force a non zero here. + * Set close enough to possible prog address. 
*/ - insn[1].imm = 1; + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + } } err = bpf_prog_alloc_jited_linfo(prog); @@ -19226,6 +19417,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) BPF_CLASS(insn->code) == BPF_ST) && BPF_MODE(insn->code) == BPF_PROBE_MEM32) num_exentries++; + if (BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; @@ -19557,6 +19751,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + *cnt = 3; } return 0; } @@ -19832,7 +20033,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) !bpf_map_ptr_unpriv(aux)) { struct bpf_jit_poke_descriptor desc = { .reason = BPF_POKE_REASON_TAIL_CALL, - .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), + .tail_call.map = aux->map_ptr_state.map_ptr, .tail_call.key = bpf_map_key_immediate(aux), .insn_idx = i + delta, }; @@ -19861,7 +20062,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) return -EINVAL; } - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -19969,7 +20170,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { @@ -20075,6 +20276,30 @@ patch_map_ops_generic: goto next_insn; } +#ifdef CONFIG_X86_64 + /* Implement bpf_get_smp_processor_id() inline. */ + if (insn->imm == BPF_FUNC_get_smp_processor_id && + prog->jit_requested && bpf_jit_supports_percpu_insn()) { + /* BPF_FUNC_get_smp_processor_id inlining is an + * optimization, so if pcpu_hot.cpu_number is ever + * changed in some incompatible and hard to support + * way, it's fine to back out this inlining logic + */ + insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } +#endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { @@ -20158,6 +20383,62 @@ patch_map_ops_generic: goto next_insn; } + /* Implement bpf_get_branch_snapshot inline. 
*/ + if (IS_ENABLED(CONFIG_PERF_EVENTS) && + prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_get_branch_snapshot) { + /* We are dealing with the following func protos: + * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); + * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); + */ + const u32 br_entry_size = sizeof(struct perf_branch_entry); + + /* struct perf_branch_entry is part of UAPI and is + * used as an array element, so extremely unlikely to + * ever grow or shrink + */ + BUILD_BUG_ON(br_entry_size != 24); + + /* if (unlikely(flags)) return -EINVAL */ + insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); + + /* Transform size (bytes) into number of entries (cnt = size / 24). + * But to avoid expensive division instruction, we implement + * divide-by-3 through multiplication, followed by further + * division by 8 through 3-bit right shift. + * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., + * p. 227, chapter "Unsigned Division by 3" for details and proofs. + * + * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. + */ + insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); + insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); + insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); + + /* call perf_snapshot_branch_stack implementation */ + insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); + /* if (entry_cnt == 0) return -ENOENT */ + insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); + /* return entry_cnt * sizeof(struct perf_branch_entry) */ + insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); + insn_buf[7] = BPF_JMP_A(3); + /* return -EINVAL; */ + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + insn_buf[9] = BPF_JMP_A(1); + /* return -ENOENT; */ + insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); + cnt = 11; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + /* Implement bpf_kptr_xchg inline */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_kptr_xchg && diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 802e4f77a118..0ba722b57af3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1188,9 +1188,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { -#ifndef CONFIG_X86 - return -ENOENT; -#else static const u32 br_entry_size = sizeof(struct perf_branch_entry); u32 entry_cnt = size / br_entry_size; @@ -1203,7 +1200,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return -ENOENT; return entry_cnt * br_entry_size; -#endif } static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 569e6d2dc55c..207ff87194db 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -13431,7 +13431,7 @@ static struct bpf_test tests[] = { .stack_depth = 8, .nr_testruns = NR_PATTERN_RUNS, }, - /* 64-bit atomic magnitudes */ + /* 32-bit atomic magnitudes */ { "ATOMIC_W_ADD: all operand magnitudes", { }, diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 25b75844891a..891cdf61c65a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -79,6 +79,51 @@ static int dummy_ops_call_op(void *image, struct 
bpf_dummy_ops_test_args *args) args->args[3], args->args[4]); } +static const struct bpf_ctx_arg_aux *find_ctx_arg_info(struct bpf_prog_aux *aux, int offset) +{ + int i; + + for (i = 0; i < aux->ctx_arg_info_size; i++) + if (aux->ctx_arg_info[i].offset == offset) + return &aux->ctx_arg_info[i]; + + return NULL; +} + +/* There is only one check at the moment: + * - zero should not be passed for pointer parameters not marked as nullable. + */ +static int check_test_run_args(struct bpf_prog *prog, struct bpf_dummy_ops_test_args *args) +{ + const struct btf_type *func_proto = prog->aux->attach_func_proto; + + for (u32 arg_no = 0; arg_no < btf_type_vlen(func_proto) ; ++arg_no) { + const struct btf_param *param = &btf_params(func_proto)[arg_no]; + const struct bpf_ctx_arg_aux *info; + const struct btf_type *t; + int offset; + + if (args->args[arg_no] != 0) + continue; + + /* Program is validated already, so there is no need + * to check if t is NULL. + */ + t = btf_type_skip_modifiers(bpf_dummy_ops_btf, param->type, NULL); + if (!btf_type_is_ptr(t)) + continue; + + offset = btf_ctx_arg_offset(bpf_dummy_ops_btf, func_proto, arg_no); + info = find_ctx_arg_info(prog->aux, offset); + if (info && (info->reg_type & PTR_MAYBE_NULL)) + continue; + + return -EINVAL; + } + + return 0; +} + extern const struct bpf_link_ops bpf_struct_ops_link_lops; int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, @@ -87,7 +132,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_links *tlinks = NULL; struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; @@ -109,6 +154,10 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(args)) return PTR_ERR(args); + err = check_test_run_args(prog, args); + if (err) + goto out; + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); if (!tlinks) { err = -ENOMEM; @@ -232,7 +281,7 @@ static void bpf_dummy_unreg(void *kdata) { } -static int bpf_dummy_test_1(struct bpf_dummy_ops_state *cb) +static int bpf_dummy_ops__test_1(struct bpf_dummy_ops_state *cb__nullable) { return 0; } @@ -249,7 +298,7 @@ static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb) } static struct bpf_dummy_ops __bpf_bpf_dummy_ops = { - .test_1 = bpf_dummy_test_1, + .test_1 = bpf_dummy_ops__test_1, .test_2 = bpf_dummy_test_2, .test_sleepable = bpf_dummy_test_sleepable, }; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 61efeadaff8d..f6aad4ed2ab2 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -575,6 +575,13 @@ __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, return a + *b + c + d + (long)e + f + g; } +__bpf_kfunc int bpf_modify_return_test_tp(int nonce) +{ + trace_bpf_trigger_tp(nonce); + + return nonce; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -622,6 +629,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_test_modify_return_ids) diff --git a/net/core/filter.c b/net/core/filter.c index 5662464e1abd..6d319c76188b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,6 +87,9 @@ #include "dev.h" 
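
As an aside on the bpf_get_branch_snapshot inlining shown a little earlier: the patched sequence converts the buffer size in bytes into an entry count without a division instruction, multiplying by the reciprocal constant 0xaaaaaaab and shifting right by 36 (33 bits for the /3, plus 3 more for the /8). The snippet below is an editorial plausibility check, not part of the patch; it only assumes the size fits in 32 bits, as it does for the helper argument:

    /* Standalone check (hypothetical, not from the patch): the multiply-and-
     * shift rewrite matches a plain division by
     * sizeof(struct perf_branch_entry) == 24 for any 32-bit size.
     */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t size;

        /* sample the u32 range; the product never overflows 64 bits */
        for (size = 0; size <= UINT32_MAX; size += 7919)
            assert(((size * 0xaaaaaaabULL) >> 36) == size / 24);

        printf("(size * 0xaaaaaaab) >> 36 == size / 24 for all sampled sizes\n");
        return 0;
    }
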
+/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -5886,7 +5889,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -6029,7 +6035,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6107,7 +6116,7 @@ set_fwd_params: #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ - BPF_FIB_LOOKUP_SRC) + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8598466a3805..9402889840bf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. 
+ */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); fdput(f); return ret; } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); put_map: @@ -1460,55 +1472,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. 
+ */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1533,7 +1574,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1663,6 +1704,196 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; + enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + sockmap_link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + sockmap_link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. 
*/ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = sockmap_link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + sockmap_link->map = map; + sockmap_link->attach_type = attach_type; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. 
+ */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 05dc2d05bc7c..7e52ab24e40a 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1156,8 +1156,6 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { }; BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, bbr_init) BTF_ID_FLAGS(func, bbr_main) BTF_ID_FLAGS(func, bbr_sndbuf_expand) @@ -1166,8 +1164,6 @@ BTF_ID_FLAGS(func, bbr_cwnd_event) BTF_ID_FLAGS(func, bbr_ssthresh) BTF_ID_FLAGS(func, bbr_min_tso_segs) BTF_ID_FLAGS(func, bbr_set_state) -#endif -#endif BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 44869ea089e3..5dbed91c6178 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -486,16 +486,12 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { }; BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, cubictcp_init) BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) BTF_ID_FLAGS(func, cubictcp_cong_avoid) BTF_ID_FLAGS(func, cubictcp_state) BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) -#endif -#endif BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index e33fbe4933e4..6b712a33d49f 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -261,16 +261,12 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { }; BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, dctcp_init) BTF_ID_FLAGS(func, dctcp_update_alpha) BTF_ID_FLAGS(func, dctcp_cwnd_event) BTF_ID_FLAGS(func, dctcp_ssthresh) BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) -#endif -#endif BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 384fa5e2f065..53e1150f706f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -913,7 +913,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. 
*/ @@ -923,7 +923,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index ac8487dcff1d..4315652678b9 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -31,9 +31,9 @@ see_also = $(subst " ",, \ "\n" \ "SEE ALSO\n" \ "========\n" \ - "\t**bpf**\ (2),\n" \ - "\t**bpf-helpers**\\ (7)" \ - $(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \ + "**bpf**\ (2),\n" \ + "**bpf-helpers**\\ (7)" \ + $(foreach page,$(call list_pages,$(1)),",\n**$(page)**\\ (8)") \ "\n") $(OUTPUT)%.8: %.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 342716f74ec4..eaba24320fb2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -14,82 +14,76 @@ tool for inspection of BTF data SYNOPSIS ======== - **bpftool** [*OPTIONS*] **btf** *COMMAND* +**bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } - *COMMANDS* := { **dump** | **help** } +*COMMANDS* := { **dump** | **help** } BTF COMMANDS ============= -| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] -| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] -| **bpftool** **btf help** +| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] +| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] +| **bpftool** **btf help** | -| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } -| *FORMAT* := { **raw** | **c** } -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } +| *FORMAT* := { **raw** | **c** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } DESCRIPTION =========== - **bpftool btf { show | list }** [**id** *BTF_ID*] - Show information about loaded BTF objects. If a BTF ID is - specified, show information only about given BTF object, - otherwise list all BTF objects currently loaded on the - system. +bpftool btf { show | list } [id *BTF_ID*] + Show information about loaded BTF objects. If a BTF ID is specified, show + information only about given BTF object, otherwise list all BTF objects + currently loaded on the system. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BTF - objects. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BTF objects. On such kernels + bpftool will automatically emit this information as well. - **bpftool btf dump** *BTF_SRC* - Dump BTF entries from a given *BTF_SRC*. +bpftool btf dump *BTF_SRC* + Dump BTF entries from a given *BTF_SRC*. - When **id** is specified, BTF object with that ID will be - loaded and all its BTF types emitted. 
+ When **id** is specified, BTF object with that ID will be loaded and all + its BTF types emitted. - When **map** is provided, it's expected that map has - associated BTF object with BTF types describing key and - value. It's possible to select whether to dump only BTF - type(s) associated with key (**key**), value (**value**), - both key and value (**kv**), or all BTF types present in - associated BTF object (**all**). If not specified, **kv** - is assumed. + When **map** is provided, it's expected that map has associated BTF object + with BTF types describing key and value. It's possible to select whether to + dump only BTF type(s) associated with key (**key**), value (**value**), + both key and value (**kv**), or all BTF types present in associated BTF + object (**all**). If not specified, **kv** is assumed. - When **prog** is provided, it's expected that program has - associated BTF object with BTF types. + When **prog** is provided, it's expected that program has associated BTF + object with BTF types. - When specifying *FILE*, an ELF file is expected, containing - .BTF section with well-defined BTF binary format data, - typically produced by clang or pahole. + When specifying *FILE*, an ELF file is expected, containing .BTF section + with well-defined BTF binary format data, typically produced by clang or + pahole. - **format** option can be used to override default (raw) - output format. Raw (**raw**) or C-syntax (**c**) output - formats are supported. + **format** option can be used to override default (raw) output format. Raw + (**raw**) or C-syntax (**c**) output formats are supported. - **bpftool btf help** - Print short help message. +bpftool btf help + Print short help message. OPTIONS ======= - .. include:: common_options.rst - - -B, --base-btf *FILE* - Pass a base BTF object. Base BTF objects are typically used - with BTF objects for kernel modules. To avoid duplicating - all kernel symbols required by modules, BTF objects for - modules are "split", they are built incrementally on top of - the kernel (vmlinux) BTF object. So the base BTF reference - should usually point to the kernel BTF. - - When the main BTF object to process (for example, the - module BTF to dump) is passed as a *FILE*, bpftool attempts - to autodetect the path for the base object, and passing - this option is optional. When the main BTF object is passed - through other handles, this option becomes necessary. +.. include:: common_options.rst + +-B, --base-btf *FILE* + Pass a base BTF object. Base BTF objects are typically used with BTF + objects for kernel modules. To avoid duplicating all kernel symbols + required by modules, BTF objects for modules are "split", they are + built incrementally on top of the kernel (vmlinux) BTF object. So the + base BTF reference should usually point to the kernel BTF. + + When the main BTF object to process (for example, the module BTF to + dump) is passed as a *FILE*, bpftool attempts to autodetect the path + for the base object, and passing this option is optional. When the main + BTF object is passed through other handles, this option becomes + necessary. 
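
The dump-by-ID flow this page documents for bpftool can also be reproduced programmatically. The sketch below uses public libbpf BTF APIs and is editorial rather than part of the patch (the BTF ID 42 is a placeholder, and loading kernel BTF objects generally requires privileges):

    /* Hypothetical sketch: list the types of a kernel-loaded BTF object by
     * ID, roughly what "bpftool btf dump id 42" walks through.
     */
    #include <stdio.h>
    #include <bpf/btf.h>

    int main(void)
    {
        struct btf *btf = btf__load_from_kernel_by_id(42);
        __u32 id, n;

        if (!btf)
            return 1;

        n = btf__type_cnt(btf);    /* count includes 'void' at ID 0 */
        for (id = 1; id < n; id++) {
            const struct btf_type *t = btf__type_by_id(btf, id);

            printf("[%u] kind %u name %s\n", id, (unsigned int)btf_kind(t),
                   btf__name_by_offset(btf, t->name_off));
        }
        btf__free(btf);
        return 0;
    }
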
EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 2ce900f66d6e..e8185596a759 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -14,134 +14,125 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **cgroup** *COMMAND* +**bpftool** [*OPTIONS*] **cgroup** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } - *COMMANDS* := - { **show** | **list** | **tree** | **attach** | **detach** | **help** } +*COMMANDS* := +{ **show** | **list** | **tree** | **attach** | **detach** | **help** } CGROUP COMMANDS =============== -| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] -| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] -| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] -| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* -| **bpftool** **cgroup help** +| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] +| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] +| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] +| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* +| **bpftool** **cgroup help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | -| **cgroup_inet_sock_create** | **cgroup_sock_ops** | -| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | -| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | -| **cgroup_inet4_connect** | **cgroup_inet6_connect** | -| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | -| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | -| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | -| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | -| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | -| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | -| **cgroup_unix_recvmsg** | **cgroup_sysctl** | -| **cgroup_getsockopt** | **cgroup_setsockopt** | -| **cgroup_inet_sock_release** } -| *ATTACH_FLAGS* := { **multi** | **override** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | +| **cgroup_inet_sock_create** | **cgroup_sock_ops** | +| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | +| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | +| **cgroup_inet4_connect** | **cgroup_inet6_connect** | +| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | +| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | +| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | +| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | +| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | +| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | +| **cgroup_unix_recvmsg** | **cgroup_sysctl** | +| **cgroup_getsockopt** | **cgroup_setsockopt** | +| **cgroup_inet_sock_release** } +| *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION =========== - **bpftool cgroup { show | list }** *CGROUP* [**effective**] - List all programs attached to the cgroup *CGROUP*. 
- - Output will start with program ID followed by attach type, - attach flags and program name. - - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. - - **bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] - Iterate over all cgroups in *CGROUP_ROOT* and list all - attached programs. If *CGROUP_ROOT* is not specified, - bpftool uses cgroup v2 mountpoint. - - The output is similar to the output of cgroup show/list - commands: it starts with absolute cgroup path, followed by - program ID, attach type, attach flags and program name. - - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. - - **bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] - Attach program *PROG* to the cgroup *CGROUP* with attach type - *ATTACH_TYPE* and optional *ATTACH_FLAGS*. - - *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs - some bpf program, the program in this cgroup yields to sub-cgroup - program; **multi** if a sub-cgroup installs some bpf program, - that cgroup program gets run in addition to the program in this - cgroup. - - Only one program is allowed to be attached to a cgroup with - no attach flags or the **override** flag. Attaching another - program will release old program and attach the new one. - - Multiple programs are allowed to be attached to a cgroup with - **multi**. They are executed in FIFO order (those that were - attached first, run first). - - Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 - and later. - - *ATTACH_TYPE* can be on of: - **ingress** ingress path of the inet socket (since 4.10); - **egress** egress path of the inet socket (since 4.10); - **sock_create** opening of an inet socket (since 4.10); - **sock_ops** various socket operations (since 4.12); - **device** device access (since 4.15); - **bind4** call to bind(2) for an inet4 socket (since 4.17); - **bind6** call to bind(2) for an inet6 socket (since 4.17); - **post_bind4** return from bind(2) for an inet4 socket (since 4.17); - **post_bind6** return from bind(2) for an inet6 socket (since 4.17); - **connect4** call to connect(2) for an inet4 socket (since 4.17); - **connect6** call to connect(2) for an inet6 socket (since 4.17); - **connect_unix** call to connect(2) for a unix socket (since 6.7); - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp4 socket (since 4.18); - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp6 socket (since 4.18); - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for - an unconnected unix socket (since 6.7); - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp4 socket (since 5.2); - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp6 socket (since 5.2); - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected unix socket (since 6.7); - **sysctl** sysctl access (since 5.2); - **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3); - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); 
- **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); - **sock_release** closing an userspace inet socket (since 5.9). - - **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* - Detach *PROG* from the cgroup *CGROUP* and attach type - *ATTACH_TYPE*. - - **bpftool prog help** - Print short help message. +bpftool cgroup { show | list } *CGROUP* [effective] + List all programs attached to the cgroup *CGROUP*. + + Output will start with program ID followed by attach type, attach flags and + program name. + + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. + +bpftool cgroup tree [*CGROUP_ROOT*] [effective] + Iterate over all cgroups in *CGROUP_ROOT* and list all attached programs. + If *CGROUP_ROOT* is not specified, bpftool uses cgroup v2 mountpoint. + + The output is similar to the output of cgroup show/list commands: it starts + with absolute cgroup path, followed by program ID, attach type, attach + flags and program name. + + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. + +bpftool cgroup attach *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] + Attach program *PROG* to the cgroup *CGROUP* with attach type *ATTACH_TYPE* + and optional *ATTACH_FLAGS*. + + *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs some + bpf program, the program in this cgroup yields to sub-cgroup program; + **multi** if a sub-cgroup installs some bpf program, that cgroup program + gets run in addition to the program in this cgroup. + + Only one program is allowed to be attached to a cgroup with no attach flags + or the **override** flag. Attaching another program will release old + program and attach the new one. + + Multiple programs are allowed to be attached to a cgroup with **multi**. + They are executed in FIFO order (those that were attached first, run + first). + + Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 and later. 
+ + *ATTACH_TYPE* can be one of: + + - **ingress** ingress path of the inet socket (since 4.10) + - **egress** egress path of the inet socket (since 4.10) + - **sock_create** opening of an inet socket (since 4.10) + - **sock_ops** various socket operations (since 4.12) + - **device** device access (since 4.15) + - **bind4** call to bind(2) for an inet4 socket (since 4.17) + - **bind6** call to bind(2) for an inet6 socket (since 4.17) + - **post_bind4** return from bind(2) for an inet4 socket (since 4.17) + - **post_bind6** return from bind(2) for an inet6 socket (since 4.17) + - **connect4** call to connect(2) for an inet4 socket (since 4.17) + - **connect6** call to connect(2) for an inet6 socket (since 4.17) + - **connect_unix** call to connect(2) for a unix socket (since 6.7) + - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp4 socket (since 4.18) + - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp6 socket (since 4.18) + - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected unix socket (since 6.7) + - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp4 socket (since 5.2) + - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp6 socket (since 5.2) + - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected unix socket (since 6.7) + - **sysctl** sysctl access (since 5.2) + - **getsockopt** call to getsockopt (since 5.3) + - **setsockopt** call to setsockopt (since 5.3) + - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8) + - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8) + - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7) + - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8) + - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8) + - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7) + - **sock_release** closing a userspace inet socket (since 5.9) + +bpftool cgroup detach *CGROUP* *ATTACH_TYPE* *PROG* + Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. + +bpftool prog help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned programs. +-f, --bpffs + Show file names of pinned programs. 
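
For callers that prefer to drive the same attachment from C rather than through bpftool, libbpf exposes a thin syscall wrapper. A rough editorial sketch follows (the attach type, flags, and file descriptors are placeholders, and the program is assumed to be loaded already):

    /* Hypothetical helper, not from the patch: attach a loaded program to a
     * cgroup, the operation "bpftool cgroup attach" performs.
     */
    #include <fcntl.h>
    #include <unistd.h>
    #include <bpf/bpf.h>

    static int attach_to_cgroup(int prog_fd, const char *cgroup_path)
    {
        int cgroup_fd, err;

        cgroup_fd = open(cgroup_path, O_RDONLY);
        if (cgroup_fd < 0)
            return -1;

        /* BPF_F_ALLOW_MULTI corresponds to the "multi" attach flag */
        err = bpf_prog_attach(prog_fd, cgroup_fd,
                              BPF_CGROUP_INET_INGRESS, BPF_F_ALLOW_MULTI);
        close(cgroup_fd);
        return err;
    }
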
EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index e44039f89be7..c7f837898bc7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -14,77 +14,70 @@ tool for inspection of eBPF-related parameters for Linux kernel or net device SYNOPSIS ======== - **bpftool** [*OPTIONS*] **feature** *COMMAND* +**bpftool** [*OPTIONS*] **feature** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **probe** | **help** } +*COMMANDS* := { **probe** | **help** } FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] -| **bpftool** **feature list_builtins** *GROUP* -| **bpftool** **feature help** +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature list_builtins** *GROUP* +| **bpftool** **feature help** | -| *COMPONENT* := { **kernel** | **dev** *NAME* } -| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } +| *COMPONENT* := { **kernel** | **dev** *NAME* } +| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } DESCRIPTION =========== - **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] - Probe the running kernel and dump a number of eBPF-related - parameters, such as availability of the **bpf**\ () system call, - JIT status, eBPF program types availability, eBPF helper - functions availability, and more. - - By default, bpftool **does not run probes** for - **bpf_probe_write_user**\ () and **bpf_trace_printk**\() - helpers which print warnings to kernel logs. To enable them - and run all probes, the **full** keyword should be used. - - If the **macros** keyword (but not the **-j** option) is - passed, a subset of the output is dumped as a list of - **#define** macros that are ready to be included in a C - header file, for example. If, additionally, **prefix** is - used to define a *PREFIX*, the provided string will be used - as a prefix to the names of the macros: this can be used to - avoid conflicts on macro names when including the output of - this command as a header file. - - Keyword **kernel** can be omitted. If no probe target is - specified, probing the kernel is the default behaviour. - - When the **unprivileged** keyword is used, bpftool will dump - only the features available to a user who does not have the - **CAP_SYS_ADMIN** capability set. The features available in - that case usually represent a small subset of the parameters - supported by the system. Unprivileged users MUST use the - **unprivileged** keyword: This is to avoid misdetection if - bpftool is inadvertently run as non-root, for example. This - keyword is unavailable if bpftool was compiled without - libcap. - - **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] - Probe network device for supported eBPF features and dump - results to the console. - - The keywords **full**, **macros** and **prefix** have the - same role as when probing the kernel. - - **bpftool feature list_builtins** *GROUP* - List items known to bpftool. These can be BPF program types - (**prog_types**), BPF map types (**map_types**), attach types - (**attach_types**), link types (**link_types**), or BPF helper - functions (**helpers**). 
The command does not probe the system, but - simply lists the elements that bpftool knows from compilation time, - as provided from libbpf (for all object types) or from the BPF UAPI - header (list of helpers). This can be used in scripts to iterate over - BPF types or helpers. - - **bpftool feature help** - Print short help message. +bpftool feature probe [kernel] [full] [macros [prefix *PREFIX*]] + Probe the running kernel and dump a number of eBPF-related parameters, such + as availability of the **bpf**\ () system call, JIT status, eBPF program + types availability, eBPF helper functions availability, and more. + + By default, bpftool **does not run probes** for **bpf_probe_write_user**\ + () and **bpf_trace_printk**\() helpers which print warnings to kernel logs. + To enable them and run all probes, the **full** keyword should be used. + + If the **macros** keyword (but not the **-j** option) is passed, a subset + of the output is dumped as a list of **#define** macros that are ready to + be included in a C header file, for example. If, additionally, **prefix** + is used to define a *PREFIX*, the provided string will be used as a prefix + to the names of the macros: this can be used to avoid conflicts on macro + names when including the output of this command as a header file. + + Keyword **kernel** can be omitted. If no probe target is specified, probing + the kernel is the default behaviour. + + When the **unprivileged** keyword is used, bpftool will dump only the + features available to a user who does not have the **CAP_SYS_ADMIN** + capability set. The features available in that case usually represent a + small subset of the parameters supported by the system. Unprivileged users + MUST use the **unprivileged** keyword: This is to avoid misdetection if + bpftool is inadvertently run as non-root, for example. This keyword is + unavailable if bpftool was compiled without libcap. + +bpftool feature probe dev *NAME* [full] [macros [prefix *PREFIX*]] + Probe network device for supported eBPF features and dump results to the + console. + + The keywords **full**, **macros** and **prefix** have the same role as when + probing the kernel. + +bpftool feature list_builtins *GROUP* + List items known to bpftool. These can be BPF program types + (**prog_types**), BPF map types (**map_types**), attach types + (**attach_types**), link types (**link_types**), or BPF helper functions + (**helpers**). The command does not probe the system, but simply lists the + elements that bpftool knows from compilation time, as provided from libbpf + (for all object types) or from the BPF UAPI header (list of helpers). This + can be used in scripts to iterate over BPF types or helpers. + +bpftool feature help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. 
include:: common_options.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 5e60825818dd..c768e6d4ae09 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,199 +14,177 @@ tool for BPF code-generation SYNOPSIS ======== - **bpftool** [*OPTIONS*] **gen** *COMMAND* +**bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } - *COMMAND* := { **object** | **skeleton** | **help** } +*COMMAND* := { **object** | **skeleton** | **help** } GEN COMMANDS ============= -| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] -| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] -| **bpftool** **gen help** +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] +| **bpftool** **gen help** DESCRIPTION =========== - **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] - Statically link (combine) together one or more *INPUT_FILE*'s - into a single resulting *OUTPUT_FILE*. All the files involved - are BPF ELF object files. - - The rules of BPF static linking are mostly the same as for - user-space object files, but in addition to combining data - and instruction sections, .BTF and .BTF.ext (if present in - any of the input files) data are combined together. .BTF - data is deduplicated, so all the common types across - *INPUT_FILE*'s will only be represented once in the resulting - BTF information. - - BPF static linking allows to partition BPF source code into - individually compiled files that are then linked into - a single resulting BPF object file, which can be used to - generated BPF skeleton (with **gen skeleton** command) or - passed directly into **libbpf** (using **bpf_object__open()** - family of APIs). - - **bpftool gen skeleton** *FILE* - Generate BPF skeleton C header file for a given *FILE*. - - BPF skeleton is an alternative interface to existing libbpf - APIs for working with BPF objects. Skeleton code is intended - to significantly shorten and simplify code to load and work - with BPF programs from userspace side. Generated code is - tailored to specific input BPF object *FILE*, reflecting its - structure by listing out available maps, program, variables, - etc. Skeleton eliminates the need to lookup mentioned - components by name. Instead, if skeleton instantiation - succeeds, they are populated in skeleton structure as valid - libbpf types (e.g., **struct bpf_map** pointer) and can be - passed to existing generic libbpf APIs. - - In addition to simple and reliable access to maps and - programs, skeleton provides a storage for BPF links (**struct - bpf_link**) for each BPF program within BPF object. When - requested, supported BPF programs will be automatically - attached and resulting BPF links stored for further use by - user in pre-allocated fields in skeleton struct. For BPF - programs that can't be automatically attached by libbpf, - user can attach them manually, but store resulting BPF link - in per-program link field. 
All such set up links will be - automatically destroyed on BPF skeleton destruction. This - eliminates the need for users to manage links manually and - rely on libbpf support to detach programs and free up - resources. - - Another facility provided by BPF skeleton is an interface to - global variables of all supported kinds: mutable, read-only, - as well as extern ones. This interface allows to pre-setup - initial values of variables before BPF object is loaded and - verified by kernel. For non-read-only variables, the same - interface can be used to fetch values of global variables on - userspace side, even if they are modified by BPF code. - - During skeleton generation, contents of source BPF object - *FILE* is embedded within generated code and is thus not - necessary to keep around. This ensures skeleton and BPF - object file are matching 1-to-1 and always stay in sync. - Generated code is dual-licensed under LGPL-2.1 and - BSD-2-Clause licenses. - - It is a design goal and guarantee that skeleton interfaces - are interoperable with generic libbpf APIs. User should - always be able to use skeleton API to create and load BPF - object, and later use libbpf APIs to keep working with - specific maps, programs, etc. - - As part of skeleton, few custom functions are generated. - Each of them is prefixed with object name. Object name can - either be derived from object file name, i.e., if BPF object - file name is **example.o**, BPF object name will be - **example**. Object name can be also specified explicitly - through **name** *OBJECT_NAME* parameter. The following - custom functions are provided (assuming **example** as - the object name): - - - **example__open** and **example__open_opts**. - These functions are used to instantiate skeleton. It - corresponds to libbpf's **bpf_object__open**\ () API. - **_opts** variants accepts extra **bpf_object_open_opts** - options. - - - **example__load**. - This function creates maps, loads and verifies BPF - programs, initializes global data maps. It corresponds to - libppf's **bpf_object__load**\ () API. - - - **example__open_and_load** combines **example__open** and - **example__load** invocations in one commonly used - operation. - - - **example__attach** and **example__detach** - This pair of functions allow to attach and detach, - correspondingly, already loaded BPF object. Only BPF - programs of types supported by libbpf for auto-attachment - will be auto-attached and their corresponding BPF links - instantiated. For other BPF programs, user can manually - create a BPF link and assign it to corresponding fields in - skeleton struct. **example__detach** will detach both - links created automatically, as well as those populated by - user manually. - - - **example__destroy** - Detach and unload BPF programs, free up all the resources - used by skeleton and BPF object. - - If BPF object has global variables, corresponding structs - with memory layout corresponding to global data data section - layout will be created. Currently supported ones are: *.data*, - *.bss*, *.rodata*, and *.kconfig* structs/data sections. - These data sections/structs can be used to set up initial - values of variables, if set before **example__load**. - Afterwards, if target kernel supports memory-mapped BPF - arrays, same structs can be used to fetch and update - (non-read-only) data from userspace, with same simplicity - as for BPF side. - - **bpftool gen subskeleton** *FILE* - Generate BPF subskeleton C header file for a given *FILE*. 
- - Subskeletons are similar to skeletons, except they do not own - the corresponding maps, programs, or global variables. They - require that the object file used to generate them is already - loaded into a *bpf_object* by some other means. - - This functionality is useful when a library is included into a - larger BPF program. A subskeleton for the library would have - access to all objects and globals defined in it, without - having to know about the larger program. - - Consequently, there are only two functions defined - for subskeletons: - - - **example__open(bpf_object\*)** - Instantiates a subskeleton from an already opened (but not - necessarily loaded) **bpf_object**. - - - **example__destroy()** - Frees the storage for the subskeleton but *does not* unload - any BPF programs or maps. - - **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] - Generate a minimum BTF file as *OUTPUT*, derived from a given - *INPUT* BTF file, containing all needed BTF types so one, or - more, given eBPF objects CO-RE relocations may be satisfied. - - When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, - libbpf, when loading an eBPF object, has to rely on external - BTF files to be able to calculate CO-RE relocations. - - Usually, an external BTF file is built from existing kernel - DWARF data using pahole. It contains all the types used by - its respective kernel image and, because of that, is big. - - The min_core_btf feature builds smaller BTF files, customized - to one or multiple eBPF objects, so they can be distributed - together with an eBPF CO-RE based application, turning the - application portable to different kernel versions. - - Check examples bellow for more information how to use it. - - **bpftool gen help** - Print short help message. +bpftool gen object *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s into a single + resulting *OUTPUT_FILE*. All the files involved are BPF ELF object files. + + The rules of BPF static linking are mostly the same as for user-space + object files, but in addition to combining data and instruction sections, + .BTF and .BTF.ext (if present in any of the input files) data are combined + together. .BTF data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting BTF + information. + + BPF static linking allows to partition BPF source code into individually + compiled files that are then linked into a single resulting BPF object + file, which can be used to generated BPF skeleton (with **gen skeleton** + command) or passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). + +bpftool gen skeleton *FILE* + Generate BPF skeleton C header file for a given *FILE*. + + BPF skeleton is an alternative interface to existing libbpf APIs for + working with BPF objects. Skeleton code is intended to significantly + shorten and simplify code to load and work with BPF programs from userspace + side. Generated code is tailored to specific input BPF object *FILE*, + reflecting its structure by listing out available maps, program, variables, + etc. Skeleton eliminates the need to lookup mentioned components by name. + Instead, if skeleton instantiation succeeds, they are populated in skeleton + structure as valid libbpf types (e.g., **struct bpf_map** pointer) and can + be passed to existing generic libbpf APIs. 
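As a rough sketch of how such a generated skeleton might be consumed from user space (assuming, per the naming described in this section, a BPF object file **example.o**; the header name **example.skel.h** and the global variable **counter** in *.bss* are illustrative assumptions only)::

    #include <stdio.h>
    #include "example.skel.h"   /* assumed output of: bpftool gen skeleton example.o > example.skel.h */

    int main(void)
    {
        struct example *skel;
        int err;

        skel = example__open_and_load();   /* open + create maps + load and verify programs */
        if (!skel)
            return 1;

        err = example__attach(skel);       /* auto-attach programs supported by libbpf */
        if (err)
            goto out;

        /* 'counter' is a hypothetical global variable from the .bss section of
         * the BPF object, exposed through the generated skeleton struct.
         */
        printf("counter = %llu\n", (unsigned long long)skel->bss->counter);
    out:
        example__destroy(skel);            /* detach programs, unload, free resources */
        return err ? 1 : 0;
    }

The **example__**\ * functions used here are the generated functions documented in the remainder of this section; everything else is a placeholder.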
+ + In addition to simple and reliable access to maps and programs, skeleton + provides a storage for BPF links (**struct bpf_link**) for each BPF program + within BPF object. When requested, supported BPF programs will be + automatically attached and resulting BPF links stored for further use by + user in pre-allocated fields in skeleton struct. For BPF programs that + can't be automatically attached by libbpf, user can attach them manually, + but store resulting BPF link in per-program link field. All such set up + links will be automatically destroyed on BPF skeleton destruction. This + eliminates the need for users to manage links manually and rely on libbpf + support to detach programs and free up resources. + + Another facility provided by BPF skeleton is an interface to global + variables of all supported kinds: mutable, read-only, as well as extern + ones. This interface allows to pre-setup initial values of variables before + BPF object is loaded and verified by kernel. For non-read-only variables, + the same interface can be used to fetch values of global variables on + userspace side, even if they are modified by BPF code. + + During skeleton generation, contents of source BPF object *FILE* is + embedded within generated code and is thus not necessary to keep around. + This ensures skeleton and BPF object file are matching 1-to-1 and always + stay in sync. Generated code is dual-licensed under LGPL-2.1 and + BSD-2-Clause licenses. + + It is a design goal and guarantee that skeleton interfaces are + interoperable with generic libbpf APIs. User should always be able to use + skeleton API to create and load BPF object, and later use libbpf APIs to + keep working with specific maps, programs, etc. + + As part of skeleton, few custom functions are generated. Each of them is + prefixed with object name. Object name can either be derived from object + file name, i.e., if BPF object file name is **example.o**, BPF object name + will be **example**. Object name can be also specified explicitly through + **name** *OBJECT_NAME* parameter. The following custom functions are + provided (assuming **example** as the object name): + + - **example__open** and **example__open_opts**. + These functions are used to instantiate skeleton. It corresponds to + libbpf's **bpf_object__open**\ () API. **_opts** variants accepts extra + **bpf_object_open_opts** options. + + - **example__load**. + This function creates maps, loads and verifies BPF programs, initializes + global data maps. It corresponds to libppf's **bpf_object__load**\ () + API. + + - **example__open_and_load** combines **example__open** and + **example__load** invocations in one commonly used operation. + + - **example__attach** and **example__detach**. + This pair of functions allow to attach and detach, correspondingly, + already loaded BPF object. Only BPF programs of types supported by libbpf + for auto-attachment will be auto-attached and their corresponding BPF + links instantiated. For other BPF programs, user can manually create a + BPF link and assign it to corresponding fields in skeleton struct. + **example__detach** will detach both links created automatically, as well + as those populated by user manually. + + - **example__destroy**. + Detach and unload BPF programs, free up all the resources used by + skeleton and BPF object. + + If BPF object has global variables, corresponding structs with memory + layout corresponding to global data data section layout will be created. 
+ Currently supported ones are: *.data*, *.bss*, *.rodata*, and *.kconfig* + structs/data sections. These data sections/structs can be used to set up + initial values of variables, if set before **example__load**. Afterwards, + if the target kernel supports memory-mapped BPF arrays, the same structs can be + used to fetch and update (non-read-only) data from userspace, with the same + simplicity as on the BPF side. + +bpftool gen subskeleton *FILE* + Generate BPF subskeleton C header file for a given *FILE*. + + Subskeletons are similar to skeletons, except they do not own the + corresponding maps, programs, or global variables. They require that the + object file used to generate them is already loaded into a *bpf_object* by + some other means. + + This functionality is useful when a library is included into a larger BPF + program. A subskeleton for the library would have access to all objects and + globals defined in it, without having to know about the larger program. + + Consequently, there are only two functions defined for subskeletons: + + - **example__open(bpf_object\*)**. + Instantiates a subskeleton from an already opened (but not necessarily + loaded) **bpf_object**. + + - **example__destroy()**. + Frees the storage for the subskeleton but *does not* unload any BPF + programs or maps. + +bpftool gen min_core_btf *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] + Generate a minimum BTF file as *OUTPUT*, derived from a given *INPUT* BTF + file, containing all the BTF types needed so that the CO-RE relocations of one, + or more, given eBPF objects may be satisfied. + + When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, libbpf has to rely + on external BTF files to calculate CO-RE relocations when loading an eBPF + object. + + Usually, an external BTF file is built from existing kernel DWARF data + using pahole. It contains all the types used by its respective kernel image + and, because of that, is big. + + The min_core_btf feature builds smaller BTF files, customized to one or + multiple eBPF objects, so they can be distributed together with an eBPF + CO-RE based application, making the application portable across different + kernel versions. + + Check the examples below for more information on how to use it. + +bpftool gen help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -L, --use-loader - For skeletons, generate a "light" skeleton (also known as "loader" - skeleton). A light skeleton contains a loader eBPF program. It does - not use the majority of the libbpf infrastructure, and does not need - libelf. +-L, --use-loader + For skeletons, generate a "light" skeleton (also known as "loader" + skeleton). A light skeleton contains a loader eBPF program. It does not use + the majority of the libbpf infrastructure, and does not need libelf.
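To illustrate the subskeleton flow described above, here is a minimal sketch; the header name **example.subskel.h** and the function **use_library**\ () are assumptions, while **example__open**\ () and **example__destroy**\ () follow the generated functions documented for an object named **example**::

    #include <bpf/libbpf.h>
    #include "example.subskel.h"   /* assumed output of: bpftool gen subskeleton example.o */

    /* The bpf_object is owned and loaded elsewhere, e.g. via the skeleton of
     * the larger program that links in this library.
     */
    int use_library(struct bpf_object *obj)
    {
        struct example *sub = example__open(obj);

        if (!sub)
            return -1;

        /* ... use the maps, programs and globals defined by the library ... */

        example__destroy(sub);   /* frees subskeleton storage only, nothing is unloaded */
        return 0;
    }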
EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 84839d488621..2e5d81c906dc 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -14,50 +14,46 @@ tool to create BPF iterators SYNOPSIS ======== - **bpftool** [*OPTIONS*] **iter** *COMMAND* +**bpftool** [*OPTIONS*] **iter** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **pin** | **help** } +*COMMANDS* := { **pin** | **help** } ITER COMMANDS -=================== +============= -| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] -| **bpftool** **iter help** +| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] +| **bpftool** **iter help** | -| *OBJ* := /a/file/of/bpf_iter_target.o -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *OBJ* := /a/file/of/bpf_iter_target.o +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] - A bpf iterator combines a kernel iterating of - particular kernel data (e.g., tasks, bpf_maps, etc.) - and a bpf program called for each kernel data object - (e.g., one task, one bpf_map, etc.). User space can - *read* kernel iterator output through *read()* syscall. - - The *pin* command creates a bpf iterator from *OBJ*, - and pin it to *PATH*. The *PATH* should be located - in *bpffs* mount. It must not contain a dot - character ('.'), which is reserved for future extensions - of *bpffs*. - - Map element bpf iterator requires an additional parameter - *MAP* so bpf program can iterate over map elements for - that map. User can have a bpf program in kernel to run - with each map element, do checking, filtering, aggregation, - etc. without copying data to user space. - - User can then *cat PATH* to see the bpf iterator output. - - **bpftool iter help** - Print short help message. +bpftool iter pin *OBJ* *PATH* [map *MAP*] + A bpf iterator combines iteration over particular kernel data (e.g., + tasks, bpf_maps, etc.) with a bpf program called for each kernel data object + (e.g., one task, one bpf_map, etc.). User space can *read* the kernel iterator + output through the *read()* syscall. + + The *pin* command creates a bpf iterator from *OBJ* and pins it to *PATH*. + The *PATH* should be located in a *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + + A map element bpf iterator requires an additional parameter *MAP* so that the + bpf program can iterate over the elements of that map. Users can run a bpf + program in the kernel against each map element, doing checking, filtering, + aggregation, etc. without copying data to user space. + + Users can then *cat PATH* to see the bpf iterator output. + +bpftool iter help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. 
include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 52a4eee4af54..6f09d4405ed8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -14,67 +14,62 @@ tool for inspection and simple manipulation of eBPF links SYNOPSIS ======== - **bpftool** [*OPTIONS*] **link** *COMMAND* +**bpftool** [*OPTIONS*] **link** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := { **show** | **list** | **pin** | **help** } +*COMMANDS* := { **show** | **list** | **pin** | **help** } LINK COMMANDS ============= -| **bpftool** **link { show | list }** [*LINK*] -| **bpftool** **link pin** *LINK* *FILE* -| **bpftool** **link detach** *LINK* -| **bpftool** **link help** +| **bpftool** **link { show | list }** [*LINK*] +| **bpftool** **link pin** *LINK* *FILE* +| **bpftool** **link detach** *LINK* +| **bpftool** **link help** | -| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } +| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool link { show | list }** [*LINK*] - Show information about active links. If *LINK* is - specified show information only about given link, - otherwise list all links currently active on the system. +bpftool link { show | list } [*LINK*] + Show information about active links. If *LINK* is specified show + information only about given link, otherwise list all links currently + active on the system. - Output will start with link ID followed by link type and - zero or more named attributes, some of which depend on type - of link. + Output will start with link ID followed by link type and zero or more named + attributes, some of which depend on type of link. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - links. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF links. On such kernels + bpftool will automatically emit this information as well. - **bpftool link pin** *LINK* *FILE* - Pin link *LINK* as *FILE*. +bpftool link pin *LINK* *FILE* + Pin link *LINK* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool link detach** *LINK* - Force-detach link *LINK*. BPF link and its underlying BPF - program will stay valid, but they will be detached from the - respective BPF hook and BPF link will transition into - a defunct state until last open file descriptor for that - link is closed. +bpftool link detach *LINK* + Force-detach link *LINK*. BPF link and its underlying BPF program will stay + valid, but they will be detached from the respective BPF hook and BPF link + will transition into a defunct state until last open file descriptor for + that link is closed. - **bpftool link help** - Print short help message. +bpftool link help + Print short help message. OPTIONS ======= - .. include:: common_options.rst + .. 
include:: common_options.rst - -f, --bpffs - When showing BPF links, show file names of pinned - links. + -f, --bpffs + When showing BPF links, show file names of pinned links. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. + -n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 9d6a314dfd7a..252e4c538edb 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -14,166 +14,160 @@ tool for inspection and simple manipulation of eBPF maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] **map** *COMMAND* +**bpftool** [*OPTIONS*] **map** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **help** } +*COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **help** } MAP COMMANDS ============= -| **bpftool** **map** { **show** | **list** } [*MAP*] -| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ -| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ -| [**offload_dev** *NAME*] -| **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* [**key** *DATA*] -| **bpftool** **map getnext** *MAP* [**key** *DATA*] -| **bpftool** **map delete** *MAP* **key** *DATA* -| **bpftool** **map pin** *MAP* *FILE* -| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] -| **bpftool** **map peek** *MAP* -| **bpftool** **map push** *MAP* **value** *VALUE* -| **bpftool** **map pop** *MAP* -| **bpftool** **map enqueue** *MAP* **value** *VALUE* -| **bpftool** **map dequeue** *MAP* -| **bpftool** **map freeze** *MAP* -| **bpftool** **map help** +| **bpftool** **map** { **show** | **list** } [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ +| [**offload_dev** *NAME*] +| **bpftool** **map dump** *MAP* +| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* [**key** *DATA*] +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* +| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +| **bpftool** **map peek** *MAP* +| **bpftool** **map push** *MAP* **value** *VALUE* +| **bpftool** **map pop** *MAP* +| **bpftool** **map enqueue** *MAP* **value** *VALUE* +| **bpftool** **map dequeue** *MAP* +| **bpftool** **map freeze** *MAP* +| **bpftool** **map help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } -| *DATA* := { [**hex**] *BYTES* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *VALUE* := { *DATA* | *MAP* | *PROG* } -| *UPDATE_FLAGS* := 
{ **any** | **exist** | **noexist** } -| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** -| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** -| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** -| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** -| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *DATA* := { [**hex**] *BYTES* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *VALUE* := { *DATA* | *MAP* | *PROG* } +| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } DESCRIPTION =========== - **bpftool map { show | list }** [*MAP*] - Show information about loaded maps. If *MAP* is specified - show information only about given maps, otherwise list all - maps currently loaded on the system. In case of **name**, - *MAP* may match several maps which will all be shown. +bpftool map { show | list } [*MAP*] + Show information about loaded maps. If *MAP* is specified show information + only about given maps, otherwise list all maps currently loaded on the + system. In case of **name**, *MAP* may match several maps which will all + be shown. - Output will start with map ID followed by map type and - zero or more named attributes (depending on kernel version). + Output will start with map ID followed by map type and zero or more named + attributes (depending on kernel version). - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - maps. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF maps. On such kernels + bpftool will automatically emit this information as well. - **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] - Create a new map with given parameters and pin it to *bpffs* - as *FILE*. +bpftool map create *FILE* type *TYPE* key *KEY_SIZE* value *VALUE_SIZE* entries *MAX_ENTRIES* name *NAME* [flags *FLAGS*] [inner_map *MAP*] [offload_dev *NAME*] + Create a new map with given parameters and pin it to *bpffs* as *FILE*. - *FLAGS* should be an integer which is the combination of - desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h - UAPI header for existing flags). + *FLAGS* should be an integer which is the combination of desired flags, + e.g. 
1024 for **BPF_F_MMAPABLE** (see bpf.h UAPI header for existing + flags). - To create maps of type array-of-maps or hash-of-maps, the - **inner_map** keyword must be used to pass an inner map. The - kernel needs it to collect metadata related to the inner maps - that the new map will work with. + To create maps of type array-of-maps or hash-of-maps, the **inner_map** + keyword must be used to pass an inner map. The kernel needs it to collect + metadata related to the inner maps that the new map will work with. - Keyword **offload_dev** expects a network interface name, - and is used to request hardware offload for the map. + Keyword **offload_dev** expects a network interface name, and is used to + request hardware offload for the map. - **bpftool map dump** *MAP* - Dump all entries in a given *MAP*. In case of **name**, - *MAP* may match several maps which will all be dumped. +bpftool map dump *MAP* + Dump all entries in a given *MAP*. In case of **name**, *MAP* may match + several maps which will all be dumped. - **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] - Update map entry for a given *KEY*. +bpftool map update *MAP* [key *DATA*] [value *VALUE*] [*UPDATE_FLAGS*] + Update map entry for a given *KEY*. - *UPDATE_FLAGS* can be one of: **any** update existing entry - or add if doesn't exit; **exist** update only if entry already - exists; **noexist** update only if entry doesn't exist. + *UPDATE_FLAGS* can be one of: **any** update existing entry or add if + doesn't exit; **exist** update only if entry already exists; **noexist** + update only if entry doesn't exist. - If the **hex** keyword is provided in front of the bytes - sequence, the bytes are parsed as hexadecimal values, even if - no "0x" prefix is added. If the keyword is not provided, then - the bytes are parsed as decimal values, unless a "0x" prefix - (for hexadecimal) or a "0" prefix (for octal) is provided. + If the **hex** keyword is provided in front of the bytes sequence, the + bytes are parsed as hexadecimal values, even if no "0x" prefix is added. If + the keyword is not provided, then the bytes are parsed as decimal values, + unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is + provided. - **bpftool map lookup** *MAP* [**key** *DATA*] - Lookup **key** in the map. +bpftool map lookup *MAP* [key *DATA*] + Lookup **key** in the map. - **bpftool map getnext** *MAP* [**key** *DATA*] - Get next key. If *key* is not specified, get first key. +bpftool map getnext *MAP* [key *DATA*] + Get next key. If *key* is not specified, get first key. - **bpftool map delete** *MAP* **key** *DATA* - Remove entry from the map. +bpftool map delete *MAP* key *DATA* + Remove entry from the map. - **bpftool map pin** *MAP* *FILE* - Pin map *MAP* as *FILE*. +bpftool map pin *MAP* *FILE* + Pin map *MAP* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] - Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. +bpftool map event_pipe *MAP* [cpu *N* index *M*] + Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. - Install perf rings into a perf event array map and dump - output of any **bpf_perf_event_output**\ () call in the kernel. 
- By default read the number of CPUs on the system and - install perf ring for each CPU in the corresponding index - in the array. + Install perf rings into a perf event array map and dump output of any + **bpf_perf_event_output**\ () call in the kernel. By default read the + number of CPUs on the system and install perf ring for each CPU in the + corresponding index in the array. - If **cpu** and **index** are specified, install perf ring - for given **cpu** at **index** in the array (single ring). + If **cpu** and **index** are specified, install perf ring for given **cpu** + at **index** in the array (single ring). - Note that installing a perf ring into an array will silently - replace any existing ring. Any other application will stop - receiving events if it installed its rings earlier. + Note that installing a perf ring into an array will silently replace any + existing ring. Any other application will stop receiving events if it + installed its rings earlier. - **bpftool map peek** *MAP* - Peek next value in the queue or stack. +bpftool map peek *MAP* + Peek next value in the queue or stack. - **bpftool map push** *MAP* **value** *VALUE* - Push *VALUE* onto the stack. +bpftool map push *MAP* value *VALUE* + Push *VALUE* onto the stack. - **bpftool map pop** *MAP* - Pop and print value from the stack. +bpftool map pop *MAP* + Pop and print value from the stack. - **bpftool map enqueue** *MAP* **value** *VALUE* - Enqueue *VALUE* into the queue. +bpftool map enqueue *MAP* value *VALUE* + Enqueue *VALUE* into the queue. - **bpftool map dequeue** *MAP* - Dequeue and print value from the queue. +bpftool map dequeue *MAP* + Dequeue and print value from the queue. - **bpftool map freeze** *MAP* - Freeze the map as read-only from user space. Entries from a - frozen map can not longer be updated or deleted with the - **bpf**\ () system call. This operation is not reversible, - and the map remains immutable from user space until its - destruction. However, read and write permissions for BPF - programs to the map remain unchanged. +bpftool map freeze *MAP* + Freeze the map as read-only from user space. Entries from a frozen map can + not longer be updated or deleted with the **bpf**\ () system call. This + operation is not reversible, and the map remains immutable from user space + until its destruction. However, read and write permissions for BPF programs + to the map remain unchanged. - **bpftool map help** - Print short help message. +bpftool map help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned maps. +-f, --bpffs + Show file names of pinned maps. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. 
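The **update** and **lookup** commands above mirror functionality that is also reachable programmatically through libbpf's low-level wrappers. A minimal C sketch, in which the pinned path and the key/value sizes are assumptions for illustration only::

    #include <stdio.h>
    #include <bpf/bpf.h>

    int main(void)
    {
        __u32 key = 1;
        __u64 value = 42, out;
        int fd, err;

        fd = bpf_obj_get("/sys/fs/bpf/example_map");   /* hypothetical pinned map */
        if (fd < 0)
            return 1;

        /* BPF_ANY / BPF_EXIST / BPF_NOEXIST correspond to the 'any',
         * 'exist' and 'noexist' UPDATE_FLAGS keywords.
         */
        err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
        if (err)
            return 1;

        if (!bpf_map_lookup_elem(fd, &key, &out))
            printf("key %u -> %llu\n", key, (unsigned long long)out);
        return 0;
    }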
EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index dd3f9469765b..348812881297 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -14,76 +14,74 @@ tool for inspection of networking related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **net** *COMMAND* +**bpftool** [*OPTIONS*] **net** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **attach** | **detach** | **help** } +*COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } NET COMMANDS ============ -| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] -| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] -| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* -| **bpftool** **net help** +| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* +| **bpftool** **net help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } DESCRIPTION =========== - **bpftool net { show | list }** [ **dev** *NAME* ] - List bpf program attachments in the kernel networking subsystem. - - Currently, device driver xdp attachments, tcx, netkit and old-style tc - classifier/action attachments, flow_dissector as well as netfilter - attachments are implemented, i.e., for - program types **BPF_PROG_TYPE_XDP**, **BPF_PROG_TYPE_SCHED_CLS**, - **BPF_PROG_TYPE_SCHED_ACT**, **BPF_PROG_TYPE_FLOW_DISSECTOR**, - **BPF_PROG_TYPE_NETFILTER**. - - For programs attached to a particular cgroup, e.g., - **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, - **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, - users can use **bpftool cgroup** to dump cgroup attachments. - For sk_{filter, skb, msg, reuseport} and lwt/seg6 - bpf programs, users should consult other tools, e.g., iproute2. - - The current output will start with all xdp program attachments, followed by - all tcx, netkit, then tc class/qdisc bpf program attachments, then flow_dissector - and finally netfilter programs. Both xdp programs and tcx/netkit/tc programs are - ordered based on ifindex number. If multiple bpf programs attached - to the same networking device through **tc**, the order will be first - all bpf programs attached to tcx, netkit, then tc classes, then all bpf programs - attached to non clsact qdiscs, and finally all bpf programs attached - to root and clsact qdisc. - - **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] - Attach bpf program *PROG* to network interface *NAME* with - type specified by *ATTACH_TYPE*. Previously attached bpf program - can be replaced by the command used with **overwrite** option. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. - - *ATTACH_TYPE* can be of: - **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; - **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; - **xdpdrv** - Native XDP. 
runs earliest point in driver's receive path; - **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; - - **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* - Detach bpf program attached to network interface *NAME* with - type specified by *ATTACH_TYPE*. To detach bpf program, same - *ATTACH_TYPE* previously used for attach must be specified. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. - - **bpftool net help** - Print short help message. +bpftool net { show | list } [ dev *NAME* ] + List bpf program attachments in the kernel networking subsystem. + + Currently, device driver xdp attachments, tcx, netkit and old-style tc + classifier/action attachments, flow_dissector as well as netfilter + attachments are implemented, i.e., for program types **BPF_PROG_TYPE_XDP**, + **BPF_PROG_TYPE_SCHED_CLS**, **BPF_PROG_TYPE_SCHED_ACT**, + **BPF_PROG_TYPE_FLOW_DISSECTOR**, **BPF_PROG_TYPE_NETFILTER**. + + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, users + can use **bpftool cgroup** to dump cgroup attachments. For sk_{filter, skb, + msg, reuseport} and lwt/seg6 bpf programs, users should consult other + tools, e.g., iproute2. + + The current output will start with all xdp program attachments, followed by + all tcx, netkit, then tc class/qdisc bpf program attachments, then + flow_dissector and finally netfilter programs. Both xdp programs and + tcx/netkit/tc programs are ordered based on ifindex number. If multiple bpf + programs attached to the same networking device through **tc**, the order + will be first all bpf programs attached to tcx, netkit, then tc classes, + then all bpf programs attached to non clsact qdiscs, and finally all bpf + programs attached to root and clsact qdisc. + +bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] + Attach bpf program *PROG* to network interface *NAME* with type specified + by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the + command used with **overwrite** option. Currently, only XDP-related modes + are supported for *ATTACH_TYPE*. + + *ATTACH_TYPE* can be of: + **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; + **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; + **xdpdrv** - Native XDP. runs earliest point in driver's receive path; + **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + +bpftool net detach *ATTACH_TYPE* dev *NAME* + Detach bpf program attached to network interface *NAME* with type specified + by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used + for attach must be specified. Currently, only XDP-related modes are + supported for *ATTACH_TYPE*. + +bpftool net help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. 
include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 5fea633a82f1..8c1ae55be596 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -14,37 +14,37 @@ tool for inspection of perf related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **perf** *COMMAND* +**bpftool** [*OPTIONS*] **perf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **help** } +*COMMANDS* := +{ **show** | **list** | **help** } PERF COMMANDS ============= -| **bpftool** **perf** { **show** | **list** } -| **bpftool** **perf help** +| **bpftool** **perf** { **show** | **list** } +| **bpftool** **perf help** DESCRIPTION =========== - **bpftool perf { show | list }** - List all raw_tracepoint, tracepoint, kprobe attachment in the system. +bpftool perf { show | list } + List all raw_tracepoint, tracepoint, and kprobe attachments in the system. - Output will start with process id and file descriptor in that process, - followed by bpf program id, attachment information, and attachment point. - The attachment point for raw_tracepoint/tracepoint is the trace probe name. - The attachment point for k[ret]probe is either symbol name and offset, - or a kernel virtual address. - The attachment point for u[ret]probe is the file name and the file offset. + Output will start with the process id and file descriptor in that process, + followed by the bpf program id, attachment information, and attachment point. + The attachment point for raw_tracepoint/tracepoint is the trace probe name. + The attachment point for k[ret]probe is either a symbol name and offset, or a + kernel virtual address. The attachment point for u[ret]probe is the file + name and the file offset. - **bpftool perf help** - Print short help message. +bpftool perf help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. 
include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 58e6a5b10ef7..d6304e01afe0 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -14,250 +14,226 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **prog** *COMMAND* +**bpftool** [*OPTIONS*] **prog** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | - { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | - { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | +{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | +{ **-L** | **--use-loader** } } - *COMMANDS* := - { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | - **loadall** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | +**loadall** | **help** } PROG COMMANDS ============= -| **bpftool** **prog** { **show** | **list** } [*PROG*] -| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] -| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] -| **bpftool** **prog pin** *PROG* *FILE* -| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] -| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog tracelog** -| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] -| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* -| **bpftool** **prog help** +| **bpftool** **prog** { **show** | **list** } [*PROG*] +| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] +| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] +| **bpftool** **prog pin** *PROG* *FILE* +| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog tracelog** +| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* +| **bpftool** **prog help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *TYPE* := { -| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | -| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | -| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | -| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | -| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | 
**cgroup/post_bind6** | -| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | -| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | -| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | -| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | -| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | -| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | -| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** -| } -| *ATTACH_TYPE* := { -| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | -| **sk_skb_stream_parser** | **flow_dissector** -| } -| *METRICs* := { -| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | -| **itlb_misses** | **dtlb_misses** -| } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *TYPE* := { +| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | +| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | +| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | +| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | +| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | +| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | +| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | +| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | +| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | +| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** +| } +| *ATTACH_TYPE* := { +| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | +| **sk_skb_stream_parser** | **flow_dissector** +| } +| *METRICs* := { +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | +| **itlb_misses** | **dtlb_misses** +| } DESCRIPTION =========== - **bpftool prog { show | list }** [*PROG*] - Show information about loaded programs. If *PROG* is - specified show information only about given programs, - otherwise list all programs currently loaded on the system. - In case of **tag** or **name**, *PROG* may match several - programs which will all be shown. - - Output will start with program ID followed by program type and - zero or more named attributes (depending on kernel version). - - Since Linux 5.1 the kernel can collect statistics on BPF - programs (such as the total time spent running the program, - and the number of times it was run). If available, bpftool - shows such statistics. However, the kernel does not collect - them by defaults, as it slightly impacts performance on each - program run. Activation or deactivation of the feature is - performed via the **kernel.bpf_stats_enabled** sysctl knob. - - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - programs. On such kernels bpftool will automatically emit this - information as well. 
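The run statistics mentioned above (collected since Linux 5.1 when the **kernel.bpf_stats_enabled** sysctl is set) can also be read programmatically. A minimal sketch using libbpf, where the program ID 123 is only a placeholder for an ID taken from **bpftool prog show**::

    #include <stdio.h>
    #include <bpf/bpf.h>

    int main(void)
    {
        struct bpf_prog_info info = {0};
        __u32 len = sizeof(info);
        int fd = bpf_prog_get_fd_by_id(123);   /* placeholder program ID */

        if (fd < 0)
            return 1;
        if (bpf_obj_get_info_by_fd(fd, &info, &len))
            return 1;

        /* run_time_ns and run_cnt stay at zero unless the
         * kernel.bpf_stats_enabled sysctl has been turned on.
         */
        printf("%s: run_time_ns=%llu run_cnt=%llu\n", info.name,
               (unsigned long long)info.run_time_ns,
               (unsigned long long)info.run_cnt);
        return 0;
    }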
- - **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] - Dump eBPF instructions of the programs from the kernel. By - default, eBPF will be disassembled and printed to standard - output in human-readable format. In this case, **opcodes** - controls if raw opcodes should be printed as well. - - In case of **tag** or **name**, *PROG* may match several - programs which will all be dumped. However, if **file** or - **visual** is specified, *PROG* must match a single program. - - If **file** is specified, the binary image will instead be - written to *FILE*. - - If **visual** is specified, control flow graph (CFG) will be - built instead, and eBPF instructions will be presented with - CFG in DOT format, on standard output. - - If the programs have line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. - - **bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] - Dump jited image (host machine code) of the program. - - If *FILE* is specified image will be written to a file, - otherwise it will be disassembled and printed to stdout. - *PROG* must match a single program when **file** is specified. - - **opcodes** controls if raw opcodes will be printed. - - If the prog has line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. - - **bpftool prog pin** *PROG* *FILE* - Pin program *PROG* as *FILE*. - - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. - - **bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] - Load bpf program(s) from binary *OBJ* and pin as *PATH*. - **bpftool prog load** pins only the first program from the - *OBJ* as *PATH*. **bpftool prog loadall** pins all programs - from the *OBJ* under *PATH* directory. - **type** is optional, if not specified program type will be - inferred from section names. - By default bpftool will create new maps as declared in the ELF - object being loaded. **map** parameter allows for the reuse - of existing maps. It can be specified multiple times, each - time for a different map. *IDX* refers to index of the map - to be replaced in the ELF file counting from 0, while *NAME* - allows to replace a map by name. *MAP* specifies the map to - use, referring to it by **id** or through a **pinned** file. - If **offload_dev** *NAME* is specified program will be loaded - onto given networking device (offload). - If **xdpmeta_dev** *NAME* is specified program will become - device-bound without offloading, this facilitates access - to XDP metadata. - Optional **pinmaps** argument can be provided to pin all - maps under *MAP_DIR* directory. - - If **autoattach** is specified program will be attached - before pin. In that case, only the link (representing the - program attached to its hook) is pinned, not the program as - such, so the path won't show in **bpftool prog show -f**, - only show in **bpftool link show -f**. Also, this only works - when bpftool (libbpf) is able to infer all necessary - information from the object file, in particular, it's not - supported for all program types. 
If a program does not - support autoattach, bpftool falls back to regular pinning - for that program instead. - - Note: *PATH* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. - - **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] - Attach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - attached to current networking name space. - - **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] - Detach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - detached from the current networking name space. - - **bpftool prog tracelog** - Dump the trace pipe of the system to the console (stdout). - Hit <Ctrl+C> to stop printing. BPF programs can write to this - trace pipe at runtime with the **bpf_trace_printk**\ () helper. - This should be used only for debugging purposes. For - streaming data from BPF programs to user space, one can use - perf events (see also **bpftool-map**\ (8)). - - **bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] - Run BPF program *PROG* in the kernel testing infrastructure - for BPF, meaning that the program works on the data and - context provided by the user, and not on actual packets or - monitored functions etc. Return value and duration for the - test run are printed out to the console. - - Input data is read from the *FILE* passed with **data_in**. - If this *FILE* is "**-**", input data is read from standard - input. Input context, if any, is read from *FILE* passed with - **ctx_in**. Again, "**-**" can be used to read from standard - input, but only if standard input is not already in use for - input data. If a *FILE* is passed with **data_out**, output - data is written to that file. Similarly, output context is - written to the *FILE* passed with **ctx_out**. For both - output flows, "**-**" can be used to print to the standard - output (as plain text, or JSON if relevant option was - passed). If output keywords are omitted, output data and - context are discarded. Keywords **data_size_out** and - **ctx_size_out** are used to pass the size (in bytes) for the - output buffers to the kernel, although the default of 32 kB - should be more than enough for most cases. - - Keyword **repeat** is used to indicate the number of - consecutive runs to perform. Note that output data and - context printed to files correspond to the last of those - runs. The duration printed out at the end of the runs is an - average over all runs performed by the command. - - Not all program types support test run. Among those which do, - not all of them can take the **ctx_in**/**ctx_out** - arguments. bpftool does not perform checks on program types. - - **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* - Profile *METRICs* for bpf program *PROG* for *DURATION* - seconds or until user hits <Ctrl+C>. *DURATION* is optional. - If *DURATION* is not specified, the profiling will run up to - **UINT_MAX** seconds. - - **bpftool prog help** - Print short help message. +bpftool prog { show | list } [*PROG*] + Show information about loaded programs. 
If *PROG* is specified show + information only about given programs, otherwise list all programs + currently loaded on the system. In case of **tag** or **name**, *PROG* may + match several programs which will all be shown. + + Output will start with program ID followed by program type and zero or more + named attributes (depending on kernel version). + + Since Linux 5.1 the kernel can collect statistics on BPF programs (such as + the total time spent running the program, and the number of times it was + run). If available, bpftool shows such statistics. However, the kernel does + not collect them by defaults, as it slightly impacts performance on each + program run. Activation or deactivation of the feature is performed via the + **kernel.bpf_stats_enabled** sysctl knob. + + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF programs. On such kernels + bpftool will automatically emit this information as well. + +bpftool prog dump xlated *PROG* [{ file *FILE* | [opcodes] [linum] [visual] }] + Dump eBPF instructions of the programs from the kernel. By default, eBPF + will be disassembled and printed to standard output in human-readable + format. In this case, **opcodes** controls if raw opcodes should be printed + as well. + + In case of **tag** or **name**, *PROG* may match several programs which + will all be dumped. However, if **file** or **visual** is specified, + *PROG* must match a single program. + + If **file** is specified, the binary image will instead be written to + *FILE*. + + If **visual** is specified, control flow graph (CFG) will be built instead, + and eBPF instructions will be presented with CFG in DOT format, on standard + output. + + If the programs have line_info available, the source line will be + displayed. If **linum** is specified, the filename, line number and line + column will also be displayed. + +bpftool prog dump jited *PROG* [{ file *FILE* | [opcodes] [linum] }] + Dump jited image (host machine code) of the program. + + If *FILE* is specified image will be written to a file, otherwise it will + be disassembled and printed to stdout. *PROG* must match a single program + when **file** is specified. + + **opcodes** controls if raw opcodes will be printed. + + If the prog has line_info available, the source line will be displayed. If + **linum** is specified, the filename, line number and line column will also + be displayed. + +bpftool prog pin *PROG* *FILE* + Pin program *PROG* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + +bpftool prog { load | loadall } *OBJ* *PATH* [type *TYPE*] [map { idx *IDX* | name *NAME* } *MAP*] [{ offload_dev | xdpmeta_dev } *NAME*] [pinmaps *MAP_DIR*] [autoattach] + Load bpf program(s) from binary *OBJ* and pin as *PATH*. **bpftool prog + load** pins only the first program from the *OBJ* as *PATH*. **bpftool prog + loadall** pins all programs from the *OBJ* under *PATH* directory. **type** + is optional, if not specified program type will be inferred from section + names. By default bpftool will create new maps as declared in the ELF + object being loaded. **map** parameter allows for the reuse of existing + maps. It can be specified multiple times, each time for a different map. + *IDX* refers to index of the map to be replaced in the ELF file counting + from 0, while *NAME* allows to replace a map by name. 
*MAP* specifies the + map to use, referring to it by **id** or through a **pinned** file. If + **offload_dev** *NAME* is specified program will be loaded onto given + networking device (offload). If **xdpmeta_dev** *NAME* is specified program + will become device-bound without offloading, this facilitates access to XDP + metadata. Optional **pinmaps** argument can be provided to pin all maps + under *MAP_DIR* directory. + + If **autoattach** is specified program will be attached before pin. In that + case, only the link (representing the program attached to its hook) is + pinned, not the program as such, so the path won't show in **bpftool prog + show -f**, only show in **bpftool link show -f**. Also, this only works + when bpftool (libbpf) is able to infer all necessary information from the + object file, in particular, it's not supported for all program types. If a + program does not support autoattach, bpftool falls back to regular pinning + for that program instead. + + Note: *PATH* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + +bpftool prog attach *PROG* *ATTACH_TYPE* [*MAP*] + Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is attached to current networking name space. + +bpftool prog detach *PROG* *ATTACH_TYPE* [*MAP*] + Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is detached from the current networking name space. + +bpftool prog tracelog + Dump the trace pipe of the system to the console (stdout). Hit <Ctrl+C> to + stop printing. BPF programs can write to this trace pipe at runtime with + the **bpf_trace_printk**\ () helper. This should be used only for debugging + purposes. For streaming data from BPF programs to user space, one can use + perf events (see also **bpftool-map**\ (8)). + +bpftool prog run *PROG* data_in *FILE* [data_out *FILE* [data_size_out *L*]] [ctx_in *FILE* [ctx_out *FILE* [ctx_size_out *M*]]] [repeat *N*] + Run BPF program *PROG* in the kernel testing infrastructure for BPF, + meaning that the program works on the data and context provided by the + user, and not on actual packets or monitored functions etc. Return value + and duration for the test run are printed out to the console. + + Input data is read from the *FILE* passed with **data_in**. If this *FILE* + is "**-**", input data is read from standard input. Input context, if any, + is read from *FILE* passed with **ctx_in**. Again, "**-**" can be used to + read from standard input, but only if standard input is not already in use + for input data. If a *FILE* is passed with **data_out**, output data is + written to that file. Similarly, output context is written to the *FILE* + passed with **ctx_out**. For both output flows, "**-**" can be used to + print to the standard output (as plain text, or JSON if relevant option was + passed). If output keywords are omitted, output data and context are + discarded. Keywords **data_size_out** and **ctx_size_out** are used to pass + the size (in bytes) for the output buffers to the kernel, although the + default of 32 kB should be more than enough for most cases. + + Keyword **repeat** is used to indicate the number of consecutive runs to + perform. Note that output data and context printed to files correspond to + the last of those runs. 
The duration printed out at the end of the runs is + an average over all runs performed by the command. + + Not all program types support test run. Among those which do, not all of + them can take the **ctx_in**/**ctx_out** arguments. bpftool does not + perform checks on program types. + +bpftool prog profile *PROG* [duration *DURATION*] *METRICs* + Profile *METRICs* for bpf program *PROG* for *DURATION* seconds or until + user hits <Ctrl+C>. *DURATION* is optional. If *DURATION* is not specified, + the profiling will run up to **UINT_MAX** seconds. + +bpftool prog help + Print short help message. OPTIONS ======= - .. include:: common_options.rst - - -f, --bpffs - When showing BPF programs, show file names of pinned - programs. - - -m, --mapcompat - Allow loading maps with unknown map definitions. - - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. - - -L, --use-loader - Load program as a "loader" program. This is useful to debug - the generation of such programs. When this option is in - use, bpftool attempts to load the programs from the object - file into the kernel, but does not pin them (therefore, the - *PATH* must not be provided). - - When combined with the **-d**\ \|\ **--debug** option, - additional debug messages are generated, and the execution - of the loader program will use the **bpf_trace_printk**\ () - helper to log each step of loading BTF, creating the maps, - and loading the programs (see **bpftool prog tracelog** as - a way to dump those messages). +.. include:: common_options.rst + +-f, --bpffs + When showing BPF programs, show file names of pinned programs. + +-m, --mapcompat + Allow loading maps with unknown map definitions. + +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. + +-L, --use-loader + Load program as a "loader" program. This is useful to debug the generation + of such programs. When this option is in use, bpftool attempts to load the + programs from the object file into the kernel, but does not pin them + (therefore, the *PATH* must not be provided). + + When combined with the **-d**\ \|\ **--debug** option, additional debug + messages are generated, and the execution of the loader program will use + the **bpf_trace_printk**\ () helper to log each step of loading BTF, + creating the maps, and loading the programs (see **bpftool prog tracelog** + as a way to dump those messages). 
EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 8022b5321dbe..e871b9539ac7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -14,61 +14,60 @@ tool to register/unregister/introspect BPF struct_ops SYNOPSIS ======== - **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* +**bpftool** [*OPTIONS*] **struct_ops** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump** | **register** | **unregister** | **help** } STRUCT_OPS COMMANDS =================== -| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] -| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* -| **bpftool** **struct_ops help** +| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] +| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* +| **bpftool** **struct_ops help** | -| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } -| *OBJ* := /a/file/of/bpf_struct_ops.o +| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } +| *OBJ* := /a/file/of/bpf_struct_ops.o DESCRIPTION =========== - **bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] - Show brief information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it shows information only - for the given struct_ops. Otherwise, it lists all struct_ops - currently existing in the system. - - Output will start with struct_ops map ID, followed by its map - name and its struct_ops's kernel type. - - **bpftool struct_ops dump** [*STRUCT_OPS_MAP*] - Dump details information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it dumps information only - for the given struct_ops. Otherwise, it dumps all struct_ops - currently existing in the system. - - **bpftool struct_ops register** *OBJ* [*LINK_DIR*] - Register bpf struct_ops from *OBJ*. All struct_ops under - the ELF section ".struct_ops" and ".struct_ops.link" will - be registered to its kernel subsystem. For each - struct_ops in the ".struct_ops.link" section, a link - will be created. You can give *LINK_DIR* to provide a - directory path where these links will be pinned with the - same name as their corresponding map name. - - **bpftool struct_ops unregister** *STRUCT_OPS_MAP* - Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. - - **bpftool struct_ops help** - Print short help message. +bpftool struct_ops { show | list } [*STRUCT_OPS_MAP*] + Show brief information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it shows information only for the given + struct_ops. Otherwise, it lists all struct_ops currently existing in the + system. + + Output will start with struct_ops map ID, followed by its map name and its + struct_ops's kernel type. + +bpftool struct_ops dump [*STRUCT_OPS_MAP*] + Dump details information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it dumps information only for the given + struct_ops. Otherwise, it dumps all struct_ops currently existing in the + system. 
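
For orientation, the show/list behaviour described above amounts to walking the system's map IDs and keeping those of type BPF_MAP_TYPE_STRUCT_OPS; the kernel struct name that bpftool also prints comes from BTF, which this hedged sketch skips. None of the following code is part of the patch::

    /* List struct_ops maps by id and name, a rough analogue of
     * "bpftool struct_ops show" (BTF-based kernel type lookup omitted).
     */
    #include <stdio.h>
    #include <unistd.h>
    #include <bpf/bpf.h>

    int main(void)
    {
        __u32 id = 0;

        while (!bpf_map_get_next_id(id, &id)) {
            struct bpf_map_info info = {};
            __u32 len = sizeof(info);
            int fd = bpf_map_get_fd_by_id(id);

            if (fd < 0)
                continue;

            if (!bpf_obj_get_info_by_fd(fd, &info, &len) &&
                info.type == BPF_MAP_TYPE_STRUCT_OPS)
                printf("%u: %s\n", info.id, info.name);

            close(fd);
        }
        return 0;
    }

The dump command additionally walks the map value with the kernel's BTF to pretty-print each member, which is why it needs no extra arguments beyond the map reference.
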
+ +bpftool struct_ops register *OBJ* [*LINK_DIR*] + Register bpf struct_ops from *OBJ*. All struct_ops under the ELF section + ".struct_ops" and ".struct_ops.link" will be registered to its kernel + subsystem. For each struct_ops in the ".struct_ops.link" section, a link + will be created. You can give *LINK_DIR* to provide a directory path where + these links will be pinned with the same name as their corresponding map + name. + +bpftool struct_ops unregister *STRUCT_OPS_MAP* + Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. + +bpftool struct_ops help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 09e4f2ff5658..f38ae5c40439 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -14,57 +14,57 @@ tool for inspection and simple manipulation of eBPF programs and maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } +**bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } - **bpftool** **batch file** *FILE* +**bpftool** **batch file** *FILE* - **bpftool** **version** +**bpftool** **version** - *OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | - **btf** | **gen** | **struct_ops** | **iter** } +*OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | +**btf** | **gen** | **struct_ops** | **iter** } - *OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } +*OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } - *MAP-COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **event_pipe** | **help** } +*MAP-COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **event_pipe** | **help** } - *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | - **load** | **attach** | **detach** | **help** } +*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | +**load** | **attach** | **detach** | **help** } - *LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } +*LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } - *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } +*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } - *PERF-COMMANDS* := { **show** | **list** | **help** } +*PERF-COMMANDS* := { **show** | **list** | **help** } - *NET-COMMANDS* := { **show** | **list** | **help** } +*NET-COMMANDS* := { **show** | **list** | **help** } - *FEATURE-COMMANDS* := { **probe** | **help** } +*FEATURE-COMMANDS* := { **probe** | **help** } - *BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } +*BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } - *GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } +*GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } - *STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } - *ITER-COMMANDS* := { **pin** | **help** } +*ITER-COMMANDS* := { **pin** | **help** } DESCRIPTION 
=========== - *bpftool* allows for inspection and simple modification of BPF objects - on the system. +*bpftool* allows for inspection and simple modification of BPF objects on the +system. - Note that format of the output of all tools is not guaranteed to be - stable and should not be depended upon. +Note that format of the output of all tools is not guaranteed to be stable and +should not be depended upon. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -m, --mapcompat - Allow loading maps with unknown map definitions. +-m, --mapcompat + Allow loading maps with unknown map definitions. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst index 30df7a707f02..9234b9dab768 100644 --- a/tools/bpf/bpftool/Documentation/common_options.rst +++ b/tools/bpf/bpftool/Documentation/common_options.rst @@ -1,25 +1,23 @@ .. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -h, --help - Print short help message (similar to **bpftool help**). + Print short help message (similar to **bpftool help**). -V, --version - Print bpftool's version number (similar to **bpftool version**), the - number of the libbpf version in use, and optional features that were - included when bpftool was compiled. Optional features include linking - against LLVM or libbfd to provide the disassembler for JIT-ted - programs (**bpftool prog dump jited**) and usage of BPF skeletons - (some features like **bpftool prog profile** or showing pids - associated to BPF objects may rely on it). + Print bpftool's version number (similar to **bpftool version**), the number + of the libbpf version in use, and optional features that were included when + bpftool was compiled. Optional features include linking against LLVM or + libbfd to provide the disassembler for JIT-ted programs (**bpftool prog + dump jited**) and usage of BPF skeletons (some features like **bpftool prog + profile** or showing pids associated to BPF objects may rely on it). -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. + Generate JSON output. For commands that cannot produce JSON, this option + has no effect. -p, --pretty - Generate human-readable JSON output. Implies **-j**. + Generate human-readable JSON output. Implies **-j**. -d, --debug - Print all logs available, even debug-level information. This includes - logs from libbpf as well as from the verifier, when attempting to - load programs. + Print all logs available, even debug-level information. This includes logs + from libbpf as well as from the verifier, when attempting to load programs. diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 6e4f7ce6bc01..04afe2ac2228 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -106,19 +106,19 @@ _bpftool_get_link_ids() _bpftool_get_obj_map_names() { - local obj + local obj maps obj=$1 - maps=$(objdump -j maps -t $obj 2>/dev/null | \ - command awk '/g . maps/ {print $NF}') + maps=$(objdump -j .maps -t $obj 2>/dev/null | \ + command awk '/g . 
.maps/ {print $NF}') COMPREPLY+=( $( compgen -W "$maps" -- "$cur" ) ) } _bpftool_get_obj_map_idxs() { - local obj + local obj nmaps obj=$1 @@ -136,7 +136,7 @@ _sysfs_get_netdevs() # Retrieve type of the map that we are operating on. _bpftool_map_guess_map_type() { - local keyword ref + local keyword idx ref="" for (( idx=3; idx < ${#words[@]}-1; idx++ )); do case "${words[$((idx-2))]}" in lookup|update) @@ -255,8 +255,9 @@ _bpftool_map_update_get_name() _bpftool() { - local cur prev words objword json=0 - _init_completion || return + local cur prev words cword comp_args + local json=0 + _init_completion -- "$@" || return # Deal with options if [[ ${words[cword]} == -* ]]; then @@ -293,7 +294,7 @@ _bpftool() esac # Remove all options so completions don't have to deal with them. - local i + local i pprev for (( i=1; i < ${#words[@]}; )); do if [[ ${words[i]::1} == - ]] && [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then @@ -307,7 +308,7 @@ _bpftool() prev=${words[cword - 1]} pprev=${words[cword - 2]} - local object=${words[1]} command=${words[2]} + local object=${words[1]} if [[ -z $object || $cword -eq 1 ]]; then case $cur in @@ -324,8 +325,12 @@ _bpftool() esac fi + local command=${words[2]} [[ $command == help ]] && return 0 + local MAP_TYPE='id pinned name' + local PROG_TYPE='id pinned tag name' + # Completion depends on object and command in use case $object in prog) @@ -346,8 +351,6 @@ _bpftool() ;; esac - local PROG_TYPE='id pinned tag name' - local MAP_TYPE='id pinned name' local METRIC_TYPE='cycles instructions l1d_loads llc_misses \ itlb_misses dtlb_misses' case $command in @@ -457,7 +460,7 @@ _bpftool() obj=${words[3]} if [[ ${words[-4]} == "map" ]]; then - COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) return 0 fi if [[ ${words[-3]} == "map" ]]; then @@ -541,20 +544,9 @@ _bpftool() COMPREPLY=( $( compgen -W "$METRIC_TYPE duration" -- "$cur" ) ) return 0 ;; - 6) - case $prev in - duration) - return 0 - ;; - *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) - return 0 - ;; - esac - return 0 - ;; *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + [[ $prev == duration ]] && return 0 + _bpftool_once_attr "$METRIC_TYPE" return 0 ;; esac @@ -612,7 +604,7 @@ _bpftool() return 0 ;; register) - _filedir + [[ $prev == $command ]] && _filedir return 0 ;; *) @@ -638,9 +630,12 @@ _bpftool() pinned) _filedir ;; - *) + map) _bpftool_one_of_list $MAP_TYPE ;; + *) + _bpftool_once_attr 'map' + ;; esac return 0 ;; @@ -652,7 +647,6 @@ _bpftool() esac ;; map) - local MAP_TYPE='id pinned name' case $command in show|list|dump|peek|pop|dequeue|freeze) case $prev in @@ -793,13 +787,11 @@ _bpftool() # map, depending on the type of the map to update. 
case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) - local MAP_TYPE='id pinned name' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ -- "$cur" ) ) return 0 ;; prog_array) - local PROG_TYPE='id pinned tag name' COMPREPLY+=( $( compgen -W "$PROG_TYPE" \ -- "$cur" ) ) return 0 @@ -821,7 +813,7 @@ _bpftool() esac _bpftool_once_attr 'key' - local UPDATE_FLAGS='any exist noexist' + local UPDATE_FLAGS='any exist noexist' idx for (( idx=3; idx < ${#words[@]}-1; idx++ )); do if [[ ${words[idx]} == 'value' ]]; then # 'value' is present, but is not the last @@ -893,7 +885,6 @@ _bpftool() esac ;; btf) - local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned name' case $command in dump) @@ -1033,7 +1024,6 @@ _bpftool() local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \ grep '^cgroup_')" local ATTACH_FLAGS='multi override' - local PROG_TYPE='id pinned tag name' # Check for $prev = $command first if [ $prev = $command ]; then _filedir @@ -1086,7 +1076,6 @@ _bpftool() esac ;; net) - local PROG_TYPE='id pinned tag name' local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' case $command in show|list) @@ -1193,14 +1182,14 @@ _bpftool() pin|detach) if [[ $prev == "$command" ]]; then COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) - else + elif [[ $pprev == "$command" ]]; then _filedir fi return 0 ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help pin detach show list' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index cc6e6aae2447..958e92acca8e 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -244,29 +244,101 @@ int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type) return fd; } -int mount_bpffs_for_pin(const char *name, bool is_dir) +int create_and_mount_bpffs_dir(const char *dir_name) { char err_str[ERR_MAX_LEN]; - char *file; - char *dir; + bool dir_exists; int err = 0; - if (is_dir && is_bpffs(name)) + if (is_bpffs(dir_name)) return err; - file = malloc(strlen(name) + 1); - if (!file) { + dir_exists = access(dir_name, F_OK) == 0; + + if (!dir_exists) { + char *temp_name; + char *parent_name; + + temp_name = strdup(dir_name); + if (!temp_name) { + p_err("mem alloc failed"); + return -1; + } + + parent_name = dirname(temp_name); + + if (is_bpffs(parent_name)) { + /* nothing to do if already mounted */ + free(temp_name); + return err; + } + + if (access(parent_name, F_OK) == -1) { + p_err("can't create dir '%s' to pin BPF object: parent dir '%s' doesn't exist", + dir_name, parent_name); + free(temp_name); + return -1; + } + + free(temp_name); + } + + if (block_mount) { + p_err("no BPF file system found, not mounting it due to --nomount option"); + return -1; + } + + if (!dir_exists) { + err = mkdir(dir_name, S_IRWXU); + if (err) { + p_err("failed to create dir '%s': %s", dir_name, strerror(errno)); + return err; + } + } + + err = mnt_fs(dir_name, "bpf", err_str, ERR_MAX_LEN); + if (err) { + err_str[ERR_MAX_LEN - 1] = '\0'; + p_err("can't mount BPF file system on given dir '%s': %s", + dir_name, err_str); + + if (!dir_exists) + rmdir(dir_name); + } + + return err; +} + +int mount_bpffs_for_file(const char *file_name) +{ + char err_str[ERR_MAX_LEN]; + char *temp_name; + char *dir; + int err = 0; + + if (access(file_name, F_OK) != -1) { + p_err("can't pin BPF object: path '%s' already exists", file_name); + return -1; + } + + temp_name = strdup(file_name); + if (!temp_name) 
{ p_err("mem alloc failed"); return -1; } - strcpy(file, name); - dir = dirname(file); + dir = dirname(temp_name); if (is_bpffs(dir)) /* nothing to do if already mounted */ goto out_free; + if (access(dir, F_OK) == -1) { + p_err("can't pin BPF object: dir '%s' doesn't exist", dir); + err = -1; + goto out_free; + } + if (block_mount) { p_err("no BPF file system found, not mounting it due to --nomount option"); err = -1; @@ -276,12 +348,12 @@ int mount_bpffs_for_pin(const char *name, bool is_dir) err = mnt_fs(dir, "bpf", err_str, ERR_MAX_LEN); if (err) { err_str[ERR_MAX_LEN - 1] = '\0'; - p_err("can't mount BPF file system to pin the object (%s): %s", - name, err_str); + p_err("can't mount BPF file system to pin the object '%s': %s", + file_name, err_str); } out_free: - free(file); + free(temp_name); return err; } @@ -289,7 +361,7 @@ int do_pin_fd(int fd, const char *name) { int err; - err = mount_bpffs_for_pin(name, false); + err = mount_bpffs_for_file(name); if (err) return err; diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 708733b0ea06..c754a428c8c6 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -664,7 +664,8 @@ probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type, probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ") && + !grep(buf, "program of this type cannot use helper "); switch (get_vendor_id(ifindex)) { case 0x19ee: /* Netronome specific */ diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 786268f1a483..b3979ddc0189 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -386,7 +386,7 @@ static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name */ needs_typeof = btf_is_array(var) || btf_is_ptr_to_func_proto(btf, var); if (needs_typeof) - printf("typeof("); + printf("__typeof__("); err = btf_dump__emit_type_decl(d, var_type_id, &opts); if (err) @@ -1131,7 +1131,7 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) continue; codegen("\ \n\ - obj->struct_ops.%1$s = (typeof(obj->struct_ops.%1$s))\n\ + obj->struct_ops.%1$s = (__typeof__(obj->struct_ops.%1$s))\n\ bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ \n\ ", ident); diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index 6b0e5202ca7a..5c39c2ed36a2 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -76,7 +76,7 @@ static int do_pin(int argc, char **argv) goto close_obj; } - err = mount_bpffs_for_pin(path, false); + err = mount_bpffs_for_file(path); if (err) goto close_link; diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index afde9d0c2ea1..5cd503b763d7 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -526,6 +526,10 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) show_link_ifindex_json(info->netkit.ifindex, json_wtr); show_link_attach_type_json(info->netkit.attach_type, json_wtr); break; + case BPF_LINK_TYPE_SOCKMAP: + jsonw_uint_field(json_wtr, "map_id", info->sockmap.map_id); + show_link_attach_type_json(info->sockmap.attach_type, json_wtr); + break; case BPF_LINK_TYPE_XDP: show_link_ifindex_json(info->xdp.ifindex, json_wtr); break; @@ -915,6 +919,11 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) show_link_ifindex_plain(info->netkit.ifindex); 
show_link_attach_type_plain(info->netkit.attach_type); break; + case BPF_LINK_TYPE_SOCKMAP: + printf("\n\t"); + printf("map_id %u ", info->sockmap.map_id); + show_link_attach_type_plain(info->sockmap.attach_type); + break; case BPF_LINK_TYPE_XDP: printf("\n\t"); show_link_ifindex_plain(info->xdp.ifindex); diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index b8bb08d10dec..9eb764fe4cc8 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -142,7 +142,8 @@ const char *get_fd_type_name(enum bpf_obj_type type); char *get_fdinfo(int fd, const char *key); int open_obj_pinned(const char *path, bool quiet); int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type); -int mount_bpffs_for_pin(const char *name, bool is_dir); +int mount_bpffs_for_file(const char *file_name); +int create_and_mount_bpffs_dir(const char *dir_name); int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(int *, char ***)); int do_pin_fd(int fd, const char *name); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9cb42a3366c0..1a501cf09e78 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1778,7 +1778,10 @@ offload_dev: goto err_close_obj; } - err = mount_bpffs_for_pin(pinfile, !first_prog_only); + if (first_prog_only) + err = mount_bpffs_for_file(pinfile); + else + err = create_and_mount_bpffs_dir(pinfile); if (err) goto err_close_obj; @@ -2078,7 +2081,7 @@ static int profile_parse_metrics(int argc, char **argv) NEXT_ARG(); } if (selected_cnt > MAX_NUM_PROFILE_METRICS) { - p_err("too many (%d) metrics, please specify no more than %d metrics at at time", + p_err("too many (%d) metrics, please specify no more than %d metrics at a time", selected_cnt, MAX_NUM_PROFILE_METRICS); return -1; } diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index d573f2640d8e..aa43dead249c 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -515,7 +515,7 @@ static int do_register(int argc, char **argv) if (argc == 1) linkdir = GET_ARG(); - if (linkdir && mount_bpffs_for_pin(linkdir, true)) { + if (linkdir && create_and_mount_bpffs_dir(linkdir)) { p_err("can't mount bpffs for pinning"); return -1; } diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 736bdeccdfe4..65aa8ce142e5 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -111,6 +111,24 @@ .off = 0, \ .imm = IMM }) +/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ + +#define BPF_MOVSX64_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_MOVSX32_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9585f5345353..d94a72593ead 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1135,6 +1135,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -3394,6 +3395,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. 
In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -5022,7 +5027,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. * * struct socket *bpf_sock_from_file(struct file *file) @@ -5508,7 +5513,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) @@ -6720,6 +6725,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); @@ -6938,6 +6947,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle @@ -7120,6 +7131,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7152,7 +7164,7 @@ struct bpf_fib_lookup { /* output: MTU value */ __u16 mtu_result; - }; + } __attribute__((packed, aligned(2))); /* input: L3 device index for lookup * output: device index from FIB lookup */ @@ -7197,8 +7209,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { @@ -7285,6 +7308,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h index 47afae3895ec..11fc18988bc2 100644 --- a/tools/include/uapi/linux/ethtool.h +++ b/tools/include/uapi/linux/ethtool.h @@ -14,40 +14,140 @@ #ifndef _UAPI_LINUX_ETHTOOL_H #define _UAPI_LINUX_ETHTOOL_H -#include <linux/kernel.h> +#include <linux/const.h> #include <linux/types.h> #include <linux/if_ether.h> -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ +#ifndef __KERNEL__ +#include <limits.h> /* for INT_MAX */ +#endif + +/* All structures exposed to userland should be defined such that they + * have the same layout for 32-bit and 64-bit userland. + */ + +/* Note on reserved space. + * Reserved fields must not be accessed directly by user space because + * they may be replaced by a different field in the future. They must + * be initialized to zero before making the request, e.g. via memset + * of the entire structure or implicitly by not being set in a structure + * initializer. 
+ */ /** - * struct ethtool_channels - configuring number of network channel - * @cmd: ETHTOOL_{G,S}CHANNELS - * @max_rx: Read only. Maximum number of receive channel the driver support. - * @max_tx: Read only. Maximum number of transmit channel the driver support. - * @max_other: Read only. Maximum number of other channel the driver support. - * @max_combined: Read only. Maximum number of combined channel the driver - * support. Set of queues RX, TX or other. - * @rx_count: Valid values are in the range 1 to the max_rx. - * @tx_count: Valid values are in the range 1 to the max_tx. - * @other_count: Valid values are in the range 1 to the max_other. - * @combined_count: Valid values are in the range 1 to the max_combined. + * struct ethtool_cmd - DEPRECATED, link control and status + * This structure is DEPRECATED, please use struct ethtool_link_settings. + * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET + * @supported: Bitmask of %SUPPORTED_* flags for the link modes, + * physical connectors and other link features for which the + * interface supports autonegotiation or auto-detection. + * Read-only. + * @advertising: Bitmask of %ADVERTISED_* flags for the link modes, + * physical connectors and other link features that are + * advertised through autonegotiation or enabled for + * auto-detection. + * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @transceiver: Historically used to distinguish different possible + * PHY types, but not in a consistent way. Deprecated. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @maxtxpkt: Historically used to report TX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @maxrxpkt: Historically used to report RX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @lp_advertising: Bitmask of %ADVERTISED_* flags for the link modes + * and other link features that the link partner advertised + * through autonegotiation; 0 if unknown or not applicable. + * Read-only. + * @reserved: Reserved for future use; see the note on reserved space. * - * This can be used to configure RX, TX and other channels. + * The link speed in Mbps is split between @speed and @speed_hi. Use + * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to + * access it. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. 
If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GSET to get the current values before making specific + * changes and then applying them with %ETHTOOL_SSET. + * + * Deprecated fields should be ignored by both users and drivers. */ - -struct ethtool_channels { +struct ethtool_cmd { __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; + __u32 supported; + __u32 advertising; + __u16 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 transceiver; + __u8 autoneg; + __u8 mdio_support; + __u32 maxtxpkt; + __u32 maxrxpkt; + __u16 speed_hi; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __u32 lp_advertising; + __u32 reserved[2]; }; +static inline void ethtool_cmd_speed_set(struct ethtool_cmd *ep, + __u32 speed) +{ + ep->speed = (__u16)(speed & 0xFFFF); + ep->speed_hi = (__u16)(speed >> 16); +} + +static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) +{ + return (ep->speed_hi << 16) | ep->speed; +} + +/* Device supports clause 22 register access to PHY or peripherals + * using the interface defined in <linux/mii.h>. This should not be + * set if there are known to be no such peripherals present or if + * the driver only emulates clause 22 registers for compatibility. + */ +#define ETH_MDIO_SUPPORTS_C22 1 + +/* Device supports clause 45 register access to PHY or peripherals + * using the interface defined in <linux/mii.h> and <linux/mdio.h>. + * This should not be set if there are known to be no such peripherals + * present. + */ +#define ETH_MDIO_SUPPORTS_C45 2 + #define ETHTOOL_FWVERS_LEN 32 #define ETHTOOL_BUSINFO_LEN 32 #define ETHTOOL_EROMVERS_LEN 32 @@ -59,8 +159,10 @@ struct ethtool_channels { * in its bus driver structure (e.g. pci_driver::name). Must * not be an empty string. * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; may be an empty string - * @erom_version: Expansion ROM version string; may be an empty string + * @fw_version: Firmware version string; driver defined; may be an + * empty string + * @erom_version: Expansion ROM version string; driver defined; may be + * an empty string * @bus_info: Device bus address. This should match the dev_name() * string for the underlying bus device, if there is one. May be * an empty string. @@ -79,10 +181,6 @@ struct ethtool_channels { * * Users can use the %ETHTOOL_GSSET_INFO command to get the number of * strings in any string set (from Linux 2.6.34). - * - * Drivers should set at most @driver, @version, @fw_version and - * @bus_info in their get_drvinfo() implementation. The ethtool - * core fills in the other fields using other driver operations. 
*/ struct ethtool_drvinfo { __u32 cmd; @@ -99,6 +197,2075 @@ struct ethtool_drvinfo { __u32 regdump_len; }; -#define ETHTOOL_GDRVINFO 0x00000003 +#define SOPASS_MAX 6 + +/** + * struct ethtool_wolinfo - Wake-On-Lan configuration + * @cmd: Command number = %ETHTOOL_GWOL or %ETHTOOL_SWOL + * @supported: Bitmask of %WAKE_* flags for supported Wake-On-Lan modes. + * Read-only. + * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes. + * @sopass: SecureOn(tm) password; meaningful only if %WAKE_MAGICSECURE + * is set in @wolopts. + */ +struct ethtool_wolinfo { + __u32 cmd; + __u32 supported; + __u32 wolopts; + __u8 sopass[SOPASS_MAX]; +}; + +/* for passing single values */ +struct ethtool_value { + __u32 cmd; + __u32 data; +}; + +#define PFC_STORM_PREVENTION_AUTO 0xffff +#define PFC_STORM_PREVENTION_DISABLE 0 + +enum tunable_id { + ETHTOOL_ID_UNSPEC, + ETHTOOL_RX_COPYBREAK, + ETHTOOL_TX_COPYBREAK, + ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ + ETHTOOL_TX_COPYBREAK_BUF_SIZE, + /* + * Add your fresh new tunable attribute above and remember to update + * tunable_strings[] in net/ethtool/common.c + */ + __ETHTOOL_TUNABLE_COUNT, +}; + +enum tunable_type_id { + ETHTOOL_TUNABLE_UNSPEC, + ETHTOOL_TUNABLE_U8, + ETHTOOL_TUNABLE_U16, + ETHTOOL_TUNABLE_U32, + ETHTOOL_TUNABLE_U64, + ETHTOOL_TUNABLE_STRING, + ETHTOOL_TUNABLE_S8, + ETHTOOL_TUNABLE_S16, + ETHTOOL_TUNABLE_S32, + ETHTOOL_TUNABLE_S64, +}; + +struct ethtool_tunable { + __u32 cmd; + __u32 id; + __u32 type_id; + __u32 len; + void *data[]; +}; + +#define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff +#define DOWNSHIFT_DEV_DISABLE 0 + +/* Time in msecs after which link is reported as down + * 0 = lowest time supported by the PHY + * 0xff = off, link down detection according to standard + */ +#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 +#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff + +/* Energy Detect Power Down (EDPD) is a feature supported by some PHYs, where + * the PHY's RX & TX blocks are put into a low-power mode when there is no + * link detected (typically cable is un-plugged). For RX, only a minimal + * link-detection is available, and for TX the PHY wakes up to send link pulses + * to avoid any lock-ups in case the peer PHY may also be running in EDPD mode. + * + * Some PHYs may support configuration of the wake-up interval for TX pulses, + * and some PHYs may support only disabling TX pulses entirely. For the latter + * a special value is required (ETHTOOL_PHY_EDPD_NO_TX) so that this can be + * configured from userspace (should the user want it). + * + * The interval units for TX wake-up are in milliseconds, since this should + * cover a reasonable range of intervals: + * - from 1 millisecond, which does not sound like much of a power-saver + * - to ~65 seconds which is quite a lot to wait for a link to come up when + * plugging a cable + */ +#define ETHTOOL_PHY_EDPD_DFLT_TX_MSECS 0xffff +#define ETHTOOL_PHY_EDPD_NO_TX 0xfffe +#define ETHTOOL_PHY_EDPD_DISABLE 0 + +enum phy_tunable_id { + ETHTOOL_PHY_ID_UNSPEC, + ETHTOOL_PHY_DOWNSHIFT, + ETHTOOL_PHY_FAST_LINK_DOWN, + ETHTOOL_PHY_EDPD, + /* + * Add your fresh new phy tunable attribute above and remember to update + * phy_tunable_strings[] in net/ethtool/common.c + */ + __ETHTOOL_PHY_TUNABLE_COUNT, +}; + +/** + * struct ethtool_regs - hardware register dump + * @cmd: Command number = %ETHTOOL_GREGS + * @version: Dump format version. This is driver-specific and may + * distinguish different chips/revisions. 
Drivers must use new + * version numbers whenever the dump format changes in an + * incompatible way. + * @len: On entry, the real length of @data. On return, the number of + * bytes used. + * @data: Buffer for the register dump + * + * Users should use %ETHTOOL_GDRVINFO to find the maximum length of + * a register dump for the interface. They must allocate the buffer + * immediately following this structure. + */ +struct ethtool_regs { + __u32 cmd; + __u32 version; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_eeprom - EEPROM dump + * @cmd: Command number = %ETHTOOL_GEEPROM, %ETHTOOL_GMODULEEEPROM or + * %ETHTOOL_SEEPROM + * @magic: A 'magic cookie' value to guard against accidental changes. + * The value passed in to %ETHTOOL_SEEPROM must match the value + * returned by %ETHTOOL_GEEPROM for the same device. This is + * unused when @cmd is %ETHTOOL_GMODULEEEPROM. + * @offset: Offset within the EEPROM to begin reading/writing, in bytes + * @len: On entry, number of bytes to read/write. On successful + * return, number of bytes actually read/written. In case of + * error, this may indicate at what point the error occurred. + * @data: Buffer to read/write from + * + * Users may use %ETHTOOL_GDRVINFO or %ETHTOOL_GMODULEINFO to find + * the length of an on-board or module EEPROM, respectively. They + * must allocate the buffer immediately following this structure. + */ +struct ethtool_eeprom { + __u32 cmd; + __u32 magic; + __u32 offset; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_eee - Energy Efficient Ethernet information + * @cmd: ETHTOOL_{G,S}EEE + * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations + * for which there is EEE support. + * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations + * advertised as eee capable. + * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex + * combinations advertised by the link partner as eee capable. + * @eee_active: Result of the eee auto negotiation. + * @eee_enabled: EEE configured mode (enabled/disabled). + * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given + * that eee was negotiated. + * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting + * its tx lpi (after reaching 'idle' state). Effective only when eee + * was negotiated and tx_lpi_enabled was set. + * @reserved: Reserved for future use; see the note on reserved space. + */ +struct ethtool_eee { + __u32 cmd; + __u32 supported; + __u32 advertised; + __u32 lp_advertised; + __u32 eee_active; + __u32 eee_enabled; + __u32 tx_lpi_enabled; + __u32 tx_lpi_timer; + __u32 reserved[2]; +}; + +/** + * struct ethtool_modinfo - plugin module eeprom information + * @cmd: %ETHTOOL_GMODULEINFO + * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx + * @eeprom_len: Length of the eeprom + * @reserved: Reserved for future use; see the note on reserved space. + * + * This structure is used to return the information to + * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM. + * The type code indicates the eeprom data format + */ +struct ethtool_modinfo { + __u32 cmd; + __u32 type; + __u32 eeprom_len; + __u32 reserved[8]; +}; + +/** + * struct ethtool_coalesce - coalescing parameters for IRQs and stats updates + * @cmd: ETHTOOL_{G,S}COALESCE + * @rx_coalesce_usecs: How many usecs to delay an RX interrupt after + * a packet arrives. + * @rx_max_coalesced_frames: Maximum number of packets to receive + * before an RX interrupt. 
+ * @rx_coalesce_usecs_irq: Same as @rx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. + * @rx_max_coalesced_frames_irq: Same as @rx_max_coalesced_frames, + * except that this value applies while an IRQ is being serviced + * by the host. + * @tx_coalesce_usecs: How many usecs to delay a TX interrupt after + * a packet is sent. + * @tx_max_coalesced_frames: Maximum number of packets to be sent + * before a TX interrupt. + * @tx_coalesce_usecs_irq: Same as @tx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. + * @tx_max_coalesced_frames_irq: Same as @tx_max_coalesced_frames, + * except that this value applies while an IRQ is being serviced + * by the host. + * @stats_block_coalesce_usecs: How many usecs to delay in-memory + * statistics block updates. Some drivers do not have an + * in-memory statistic block, and in such cases this value is + * ignored. This value must not be zero. + * @use_adaptive_rx_coalesce: Enable adaptive RX coalescing. + * @use_adaptive_tx_coalesce: Enable adaptive TX coalescing. + * @pkt_rate_low: Threshold for low packet rate (packets per second). + * @rx_coalesce_usecs_low: How many usecs to delay an RX interrupt after + * a packet arrives, when the packet rate is below @pkt_rate_low. + * @rx_max_coalesced_frames_low: Maximum number of packets to be received + * before an RX interrupt, when the packet rate is below @pkt_rate_low. + * @tx_coalesce_usecs_low: How many usecs to delay a TX interrupt after + * a packet is sent, when the packet rate is below @pkt_rate_low. + * @tx_max_coalesced_frames_low: Maximum nuumber of packets to be sent before + * a TX interrupt, when the packet rate is below @pkt_rate_low. + * @pkt_rate_high: Threshold for high packet rate (packets per second). + * @rx_coalesce_usecs_high: How many usecs to delay an RX interrupt after + * a packet arrives, when the packet rate is above @pkt_rate_high. + * @rx_max_coalesced_frames_high: Maximum number of packets to be received + * before an RX interrupt, when the packet rate is above @pkt_rate_high. + * @tx_coalesce_usecs_high: How many usecs to delay a TX interrupt after + * a packet is sent, when the packet rate is above @pkt_rate_high. + * @tx_max_coalesced_frames_high: Maximum number of packets to be sent before + * a TX interrupt, when the packet rate is above @pkt_rate_high. + * @rate_sample_interval: How often to do adaptive coalescing packet rate + * sampling, measured in seconds. Must not be zero. + * + * Each pair of (usecs, max_frames) fields specifies that interrupts + * should be coalesced until + * (usecs > 0 && time_since_first_completion >= usecs) || + * (max_frames > 0 && completed_frames >= max_frames) + * + * It is illegal to set both usecs and max_frames to zero as this + * would cause interrupts to never be generated. To disable + * coalescing, set usecs = 0 and max_frames = 1. + * + * Some implementations ignore the value of max_frames and use the + * condition time_since_first_completion >= usecs + * + * This is deprecated. Drivers for hardware that does not support + * counting completions should validate that max_frames == !rx_usecs. + * + * Adaptive RX/TX coalescing is an algorithm implemented by some + * drivers to improve latency under low packet rates and improve + * throughput under high packet rates. Some drivers only implement + * one of RX or TX adaptive coalescing. Anything not implemented by + * the driver causes these values to be silently ignored. 
+ * + * When the packet rate is below @pkt_rate_high but above + * @pkt_rate_low (both measured in packets per second) the + * normal {rx,tx}_* coalescing parameters are used. + */ +struct ethtool_coalesce { + __u32 cmd; + __u32 rx_coalesce_usecs; + __u32 rx_max_coalesced_frames; + __u32 rx_coalesce_usecs_irq; + __u32 rx_max_coalesced_frames_irq; + __u32 tx_coalesce_usecs; + __u32 tx_max_coalesced_frames; + __u32 tx_coalesce_usecs_irq; + __u32 tx_max_coalesced_frames_irq; + __u32 stats_block_coalesce_usecs; + __u32 use_adaptive_rx_coalesce; + __u32 use_adaptive_tx_coalesce; + __u32 pkt_rate_low; + __u32 rx_coalesce_usecs_low; + __u32 rx_max_coalesced_frames_low; + __u32 tx_coalesce_usecs_low; + __u32 tx_max_coalesced_frames_low; + __u32 pkt_rate_high; + __u32 rx_coalesce_usecs_high; + __u32 rx_max_coalesced_frames_high; + __u32 tx_coalesce_usecs_high; + __u32 tx_max_coalesced_frames_high; + __u32 rate_sample_interval; +}; + +/** + * struct ethtool_ringparam - RX/TX ring parameters + * @cmd: Command number = %ETHTOOL_GRINGPARAM or %ETHTOOL_SRINGPARAM + * @rx_max_pending: Maximum supported number of pending entries per + * RX ring. Read-only. + * @rx_mini_max_pending: Maximum supported number of pending entries + * per RX mini ring. Read-only. + * @rx_jumbo_max_pending: Maximum supported number of pending entries + * per RX jumbo ring. Read-only. + * @tx_max_pending: Maximum supported number of pending entries per + * TX ring. Read-only. + * @rx_pending: Current maximum number of pending entries per RX ring + * @rx_mini_pending: Current maximum number of pending entries per RX + * mini ring + * @rx_jumbo_pending: Current maximum number of pending entries per RX + * jumbo ring + * @tx_pending: Current maximum supported number of pending entries + * per TX ring + * + * If the interface does not have separate RX mini and/or jumbo rings, + * @rx_mini_max_pending and/or @rx_jumbo_max_pending will be 0. + * + * There may also be driver-dependent minimum values for the number + * of entries per ring. + */ +struct ethtool_ringparam { + __u32 cmd; + __u32 rx_max_pending; + __u32 rx_mini_max_pending; + __u32 rx_jumbo_max_pending; + __u32 tx_max_pending; + __u32 rx_pending; + __u32 rx_mini_pending; + __u32 rx_jumbo_pending; + __u32 tx_pending; +}; + +/** + * struct ethtool_channels - configuring number of network channels + * @cmd: ETHTOOL_{G,S}CHANNELS + * @max_rx: Read only. Maximum number of receive channels the driver supports. + * @max_tx: Read only. Maximum number of transmit channels the driver supports. + * @max_other: Read only. Maximum number of other channels the driver supports. + * @max_combined: Read only. Maximum number of combined channels the driver + * supports. Set of queues RX, TX or other. + * @rx_count: Valid values are in the range 1 to the max_rx. + * @tx_count: Valid values are in the range 1 to the max_tx. + * @other_count: Valid values are in the range 1 to the max_other. + * @combined_count: Valid values are in the range 1 to the max_combined. + * + * This can be used to configure RX, TX and other channels.
+ */ + +struct ethtool_channels { + __u32 cmd; + __u32 max_rx; + __u32 max_tx; + __u32 max_other; + __u32 max_combined; + __u32 rx_count; + __u32 tx_count; + __u32 other_count; + __u32 combined_count; +}; + +/** + * struct ethtool_pauseparam - Ethernet pause (flow control) parameters + * @cmd: Command number = %ETHTOOL_GPAUSEPARAM or %ETHTOOL_SPAUSEPARAM + * @autoneg: Flag to enable autonegotiation of pause frame use + * @rx_pause: Flag to enable reception of pause frames + * @tx_pause: Flag to enable transmission of pause frames + * + * Drivers should reject a non-zero setting of @autoneg when + * autonegotiation is disabled (or not supported) for the link. + * + * If the link is autonegotiated, drivers should use + * mii_advertise_flowctrl() or similar code to set the advertised + * pause frame capabilities based on the @rx_pause and @tx_pause flags, + * even if @autoneg is zero. They should also allow the advertised + * pause frame capabilities to be controlled directly through the + * advertising field of &struct ethtool_cmd. + * + * If @autoneg is non-zero, the MAC is configured to send and/or + * receive pause frames according to the result of autonegotiation. + * Otherwise, it is configured directly based on the @rx_pause and + * @tx_pause flags. + */ +struct ethtool_pauseparam { + __u32 cmd; + __u32 autoneg; + __u32 rx_pause; + __u32 tx_pause; +}; + +/* Link extended state */ +enum ethtool_link_ext_state { + ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_STATE_NO_CABLE, + ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, + ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, + ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, + ETHTOOL_LINK_EXT_STATE_OVERHEAT, + ETHTOOL_LINK_EXT_STATE_MODULE, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ +enum ethtool_link_ext_substate_autoneg { + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. + */ +enum ethtool_link_ext_substate_link_training { + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, + ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. + */ +enum ethtool_link_ext_substate_link_logical_mismatch { + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY.
+ */ +enum ethtool_link_ext_substate_bad_signal_integrity { + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ +enum ethtool_link_ext_substate_cable_issue { + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, + ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. */ +enum ethtool_link_ext_substate_module { + ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1, +}; + +#define ETH_GSTRING_LEN 32 + +/** + * enum ethtool_stringset - string set ID + * @ETH_SS_TEST: Self-test result names, for use with %ETHTOOL_TEST + * @ETH_SS_STATS: Statistic names, for use with %ETHTOOL_GSTATS + * @ETH_SS_PRIV_FLAGS: Driver private flag names, for use with + * %ETHTOOL_GPFLAGS and %ETHTOOL_SPFLAGS + * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE; + * now deprecated + * @ETH_SS_FEATURES: Device feature names + * @ETH_SS_RSS_HASH_FUNCS: RSS hash function names + * @ETH_SS_TUNABLES: tunable names + * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS + * @ETH_SS_PHY_TUNABLES: PHY tunable names + * @ETH_SS_LINK_MODES: link mode names + * @ETH_SS_MSG_CLASSES: debug message class names + * @ETH_SS_WOL_MODES: wake-on-lan modes + * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags + * @ETH_SS_TS_TX_TYPES: timestamping Tx types + * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters + * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types + * @ETH_SS_STATS_STD: standardized stats + * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics + * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics + * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics + * @ETH_SS_STATS_RMON: names of RMON statistics + * + * @ETH_SS_COUNT: number of defined string sets + */ +enum ethtool_stringset { + ETH_SS_TEST = 0, + ETH_SS_STATS, + ETH_SS_PRIV_FLAGS, + ETH_SS_NTUPLE_FILTERS, + ETH_SS_FEATURES, + ETH_SS_RSS_HASH_FUNCS, + ETH_SS_TUNABLES, + ETH_SS_PHY_STATS, + ETH_SS_PHY_TUNABLES, + ETH_SS_LINK_MODES, + ETH_SS_MSG_CLASSES, + ETH_SS_WOL_MODES, + ETH_SS_SOF_TIMESTAMPING, + ETH_SS_TS_TX_TYPES, + ETH_SS_TS_RX_FILTERS, + ETH_SS_UDP_TUNNEL_TYPES, + ETH_SS_STATS_STD, + ETH_SS_STATS_ETH_PHY, + ETH_SS_STATS_ETH_MAC, + ETH_SS_STATS_ETH_CTRL, + ETH_SS_STATS_RMON, + + /* add new constants above here */ + ETH_SS_COUNT +}; + +/** + * enum ethtool_mac_stats_src - source of ethtool MAC statistics + * @ETHTOOL_MAC_STATS_SRC_AGGREGATE: + * if device supports a MAC merge layer, this retrieves the aggregate + * statistics of the eMAC and pMAC. Otherwise, it retrieves just the + * statistics of the single (express) MAC. + * @ETHTOOL_MAC_STATS_SRC_EMAC: + * if device supports a MM layer, this retrieves the eMAC statistics. + * Otherwise, it retrieves the statistics of the single (express) MAC. + * @ETHTOOL_MAC_STATS_SRC_PMAC: + * if device supports a MM layer, this retrieves the pMAC statistics. + */ +enum ethtool_mac_stats_src { + ETHTOOL_MAC_STATS_SRC_AGGREGATE, + ETHTOOL_MAC_STATS_SRC_EMAC, + ETHTOOL_MAC_STATS_SRC_PMAC, +}; + +/** + * enum ethtool_module_power_mode_policy - plug-in module power mode policy + * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode.
+ * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host + * to high power mode when the first port using it is put administratively + * up and to low power mode when the last port using it is put + * administratively down. + */ +enum ethtool_module_power_mode_policy { + ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1, + ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO, +}; + +/** + * enum ethtool_module_power_mode - plug-in module power mode + * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode. + * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode. + */ +enum ethtool_module_power_mode { + ETHTOOL_MODULE_POWER_MODE_LOW = 1, + ETHTOOL_MODULE_POWER_MODE_HIGH, +}; + +/** + * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are + * unknown + * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled + * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled + */ +enum ethtool_podl_pse_admin_state { + ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1, + ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED, +}; + +/** + * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is + * asserted true when the PoDL PSE state diagram variable mr_pse_enable is + * false" + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is + * asserted true when either of the PSE state diagram variables + * pi_detecting or pi_classifying is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower” + * is asserted true when the PoDL PSE state diagram variable pi_powered is + * true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted + * true when the PoDL PSE state diagram variable pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true + * when the logical combination of the PoDL PSE state diagram variables + * pi_prebiased*!pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted + * true when the PoDL PSE state diagram variable overload_held is true." 
+ */ +enum ethtool_podl_pse_pw_d_status { + ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1, + ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED, + ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING, + ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING, + ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP, + ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE, + ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR, +}; + +/** + * enum ethtool_mm_verify_status - status of MAC Merge Verify function + * @ETHTOOL_MM_VERIFY_STATUS_UNKNOWN: + * verification status is unknown + * @ETHTOOL_MM_VERIFY_STATUS_INITIAL: + * the 802.3 Verify State diagram is in the state INIT_VERIFICATION + * @ETHTOOL_MM_VERIFY_STATUS_VERIFYING: + * the Verify State diagram is in the state VERIFICATION_IDLE, + * SEND_VERIFY or WAIT_FOR_RESPONSE + * @ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: + * indicates that the Verify State diagram is in the state VERIFIED + * @ETHTOOL_MM_VERIFY_STATUS_FAILED: + * the Verify State diagram is in the state VERIFY_FAIL + * @ETHTOOL_MM_VERIFY_STATUS_DISABLED: + * verification of preemption operation is disabled + */ +enum ethtool_mm_verify_status { + ETHTOOL_MM_VERIFY_STATUS_UNKNOWN, + ETHTOOL_MM_VERIFY_STATUS_INITIAL, + ETHTOOL_MM_VERIFY_STATUS_VERIFYING, + ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED, + ETHTOOL_MM_VERIFY_STATUS_FAILED, + ETHTOOL_MM_VERIFY_STATUS_DISABLED, +}; + +/** + * struct ethtool_gstrings - string set for data tagging + * @cmd: Command number = %ETHTOOL_GSTRINGS + * @string_set: String set ID; one of &enum ethtool_stringset + * @len: On return, the number of strings in the string set + * @data: Buffer for strings. Each string is null-padded to a size of + * %ETH_GSTRING_LEN. + * + * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in + * the string set. They must allocate a buffer of the appropriate + * size immediately following this structure. + */ +struct ethtool_gstrings { + __u32 cmd; + __u32 string_set; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_sset_info - string set information + * @cmd: Command number = %ETHTOOL_GSSET_INFO + * @reserved: Reserved for future use; see the note on reserved space. + * @sset_mask: On entry, a bitmask of string sets to query, with bits + * numbered according to &enum ethtool_stringset. On return, a + * bitmask of those string sets queried that are supported. + * @data: Buffer for string set sizes. On return, this contains the + * size of each string set that was queried and supported, in + * order of ID. + * + * Example: The user passes in @sset_mask = 0x7 (sets 0, 1, 2) and on + * return @sset_mask == 0x6 (sets 1, 2). Then @data[0] contains the + * size of set 1 and @data[1] contains the size of set 2. + * + * Users must allocate a buffer of the appropriate size (4 * number of + * sets queried) immediately following this structure. + */ +struct ethtool_sset_info { + __u32 cmd; + __u32 reserved; + __u64 sset_mask; + __u32 data[]; +}; + +/** + * enum ethtool_test_flags - flags definition of ethtool_test + * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise + * only online tests. + * @ETH_TEST_FL_FAILED: Driver set this flag if test fails. + * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback + * test. 
+ * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test + */ + +enum ethtool_test_flags { + ETH_TEST_FL_OFFLINE = (1 << 0), + ETH_TEST_FL_FAILED = (1 << 1), + ETH_TEST_FL_EXTERNAL_LB = (1 << 2), + ETH_TEST_FL_EXTERNAL_LB_DONE = (1 << 3), +}; + +/** + * struct ethtool_test - device self-test invocation + * @cmd: Command number = %ETHTOOL_TEST + * @flags: A bitmask of flags from &enum ethtool_test_flags. Some + * flags may be set by the user on entry; others may be set by + * the driver on return. + * @reserved: Reserved for future use; see the note on reserved space. + * @len: On return, the number of test results + * @data: Array of test results + * + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the + * number of test results that will be returned. They must allocate a + * buffer of the appropriate size (8 * number of results) immediately + * following this structure. + */ +struct ethtool_test { + __u32 cmd; + __u32 flags; + __u32 reserved; + __u32 len; + __u64 data[]; +}; + +/** + * struct ethtool_stats - device-specific statistics + * @cmd: Command number = %ETHTOOL_GSTATS + * @n_stats: On return, the number of statistics + * @data: Array of statistics + * + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the + * number of statistics that will be returned. They must allocate a + * buffer of the appropriate size (8 * number of statistics) + * immediately following this structure. + */ +struct ethtool_stats { + __u32 cmd; + __u32 n_stats; + __u64 data[]; +}; + +/** + * struct ethtool_perm_addr - permanent hardware address + * @cmd: Command number = %ETHTOOL_GPERMADDR + * @size: On entry, the size of the buffer. On return, the size of the + * address. The command fails if the buffer is too small. + * @data: Buffer for the address + * + * Users must allocate the buffer immediately following this structure. + * A buffer size of %MAX_ADDR_LEN should be sufficient for any address + * type. + */ +struct ethtool_perm_addr { + __u32 cmd; + __u32 size; + __u8 data[]; +}; +/* boolean flags controlling per-interface behavior characteristics. + * When reading, the flag indicates whether or not a certain behavior + * is enabled/present. When writing, the flag indicates whether + * or not the driver should turn on (set) or off (clear) a behavior. + * + * Some behaviors may be read-only (unconditionally absent or present). + * If such is the case, return EINVAL in the set-flags operation if the + * flag differs from the read-only value. + */ +enum ethtool_flags { + ETH_FLAG_TXVLAN = (1 << 7), /* TX VLAN offload enabled */ + ETH_FLAG_RXVLAN = (1 << 8), /* RX VLAN offload enabled */ + ETH_FLAG_LRO = (1 << 15), /* LRO is enabled */ + ETH_FLAG_NTUPLE = (1 << 27), /* N-tuple filters enabled */ + ETH_FLAG_RXHASH = (1 << 28), +}; + +/* The following structures are for supporting RX network flow + * classification and RX n-tuple configuration. Note, all multibyte + * fields, e.g., ip4src, ip4dst, psrc, pdst, spi, etc. are expected to + * be in network byte order. + */ + +/** + * struct ethtool_tcpip4_spec - flow specification for TCP/IPv4 etc. + * @ip4src: Source host + * @ip4dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tos: Type-of-service + * + * This can be used to specify a TCP/IPv4, UDP/IPv4 or SCTP/IPv4 flow.
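The comments above describe a two-step pattern for these flexible-array commands; a hedged sketch for statistics, using %ETHTOOL_GSSET_INFO to size the %ETHTOOL_GSTRINGS buffer (fd and ifr as in the earlier examples):

        struct ethtool_sset_info *info = calloc(1, sizeof(*info) + sizeof(__u32));
        info->cmd = ETHTOOL_GSSET_INFO;
        info->sset_mask = 1ULL << ETH_SS_STATS;     /* query a single string set */
        ifr.ifr_data = (void *)info;
        ioctl(fd, SIOCETHTOOL, &ifr);
        __u32 n_stats = info->data[0];              /* size of the ETH_SS_STATS set */

        struct ethtool_gstrings *names =
                calloc(1, sizeof(*names) + n_stats * ETH_GSTRING_LEN);
        names->cmd = ETHTOOL_GSTRINGS;
        names->string_set = ETH_SS_STATS;
        ifr.ifr_data = (void *)names;
        ioctl(fd, SIOCETHTOOL, &ifr);               /* names->data: n_stats null-padded strings */
        /* %ETHTOOL_GSTATS is sized the same way: n_stats * sizeof(__u64) of data */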
+ */ +struct ethtool_tcpip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be16 psrc; + __be16 pdst; + __u8 tos; +}; + +/** + * struct ethtool_ah_espip4_spec - flow specification for IPsec/IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @spi: Security parameters index + * @tos: Type-of-service + * + * This can be used to specify an IPsec transport or tunnel over IPv4. + */ +struct ethtool_ah_espip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 spi; + __u8 tos; +}; + +#define ETH_RX_NFC_IP4 1 + +/** + * struct ethtool_usrip4_spec - general flow specification for IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tos: Type-of-service + * @ip_ver: Value must be %ETH_RX_NFC_IP4; mask must be 0 + * @proto: Transport protocol number; mask must be 0 + */ +struct ethtool_usrip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 l4_4_bytes; + __u8 tos; + __u8 ip_ver; + __u8 proto; +}; + +/** + * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc. + * @ip6src: Source host + * @ip6dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tclass: Traffic Class + * + * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow. + */ +struct ethtool_tcpip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be16 psrc; + __be16 pdst; + __u8 tclass; +}; + +/** + * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @spi: Security parameters index + * @tclass: Traffic Class + * + * This can be used to specify an IPsec transport or tunnel over IPv6. + */ +struct ethtool_ah_espip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 spi; + __u8 tclass; +}; + +/** + * struct ethtool_usrip6_spec - general flow specification for IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tclass: Traffic Class + * @l4_proto: Transport protocol number (nexthdr after any Extension Headers) + */ +struct ethtool_usrip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 l4_4_bytes; + __u8 tclass; + __u8 l4_proto; +}; + +union ethtool_flow_union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethtool_tcpip6_spec tcp_ip6_spec; + struct ethtool_tcpip6_spec udp_ip6_spec; + struct ethtool_tcpip6_spec sctp_ip6_spec; + struct ethtool_ah_espip6_spec ah_ip6_spec; + struct ethtool_ah_espip6_spec esp_ip6_spec; + struct ethtool_usrip6_spec usr_ip6_spec; + struct ethhdr ether_spec; + __u8 hdata[52]; +}; + +/** + * struct ethtool_flow_ext - additional RX flow fields + * @h_dest: destination MAC address + * @vlan_etype: VLAN EtherType + * @vlan_tci: VLAN tag control information + * @data: user defined data + * @padding: Reserved for future use; see the note on reserved space. + * + * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT + * is set in &struct ethtool_rx_flow_spec @flow_type. + * @h_dest is valid if %FLOW_MAC_EXT is set. + */ +struct ethtool_flow_ext { + __u8 padding[2]; + unsigned char h_dest[ETH_ALEN]; + __be16 vlan_etype; + __be16 vlan_tci; + __be32 data[2]; +}; + +/** + * struct ethtool_rx_flow_spec - classification rule for RX flows + * @flow_type: Type of match to perform, e.g. 
%TCP_V4_FLOW + * @h_u: Flow fields to match (dependent on @flow_type) + * @h_ext: Additional fields to match + * @m_u: Masks for flow field bits to be matched + * @m_ext: Masks for additional field bits to be matched + * Note, all additional fields must be ignored unless @flow_type + * includes the %FLOW_EXT or %FLOW_MAC_EXT flag + * (see &struct ethtool_flow_ext description). + * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC + * if packets should be discarded, or %RX_CLS_FLOW_WAKE if the + * packets should be used for Wake-on-LAN with %WAKE_FILTER + * @location: Location of rule in the table. Locations must be + * numbered such that a flow matching multiple rules will be + * classified according to the first (lowest numbered) rule. + */ +struct ethtool_rx_flow_spec { + __u32 flow_type; + union ethtool_flow_union h_u; + struct ethtool_flow_ext h_ext; + union ethtool_flow_union m_u; + struct ethtool_flow_ext m_ext; + __u64 ring_cookie; + __u32 location; +}; + +/* How rings are laid out when accessing virtual functions or + * offloaded queues is device specific. To allow users to do flow + * steering and specify these queues the ring cookie is partitioned + * into a 32bit queue index with an 8 bit virtual function id. + * This also leaves the 3bytes for further specifiers. It is possible + * future devices may support more than 256 virtual functions if + * devices start supporting PCIe w/ARI. However at the moment I + * do not know of any devices that support this so I do not reserve + * space for this at this time. If a future patch consumes the next + * byte it should be aware of this possibility. + */ +#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 +static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) +{ + return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; +} + +static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) +{ + return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; +} + +/** + * struct ethtool_rxnfc - command to get or set RX flow classification rules + * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH, + * %ETHTOOL_GRXRINGS, %ETHTOOL_GRXCLSRLCNT, %ETHTOOL_GRXCLSRULE, + * %ETHTOOL_GRXCLSRLALL, %ETHTOOL_SRXCLSRLDEL or %ETHTOOL_SRXCLSRLINS + * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW + * @data: Command-dependent value + * @fs: Flow classification rule + * @rss_context: RSS context to be affected + * @rule_cnt: Number of rules to be affected + * @rule_locs: Array of used rule locations + * + * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating + * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following + * structure fields must not be used, except that if @flow_type includes + * the %FLOW_RSS flag, then @rss_context determines which RSS context to + * act on. + * + * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues + * on return. + * + * For %ETHTOOL_GRXCLSRLCNT, @rule_cnt is set to the number of defined + * rules on return. If @data is non-zero on return then it is the + * size of the rule table, plus the flag %RX_CLS_LOC_SPECIAL if the + * driver supports any special location values. If that flag is not + * set in @data then special location values should not be used. 
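To make the @ring_cookie layout concrete, a rule returned by %ETHTOOL_GRXCLSRULE might be decoded with the helpers above (sketch only; nfc is a hypothetical struct ethtool_rxnfc already filled in by the ioctl):

        __u64 cookie = nfc.fs.ring_cookie;

        if (cookie == RX_CLS_FLOW_DISC) {
                /* matching packets are discarded */
        } else {
                __u64 queue = ethtool_get_flow_spec_ring(cookie);    /* low 32 bits */
                __u64 vf = ethtool_get_flow_spec_ring_vf(cookie);    /* next 8 bits */
                /* deliver to 'queue', optionally scoped to virtual function 'vf' */
        }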
+ * + * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an + * existing rule on entry and @fs contains the rule on return; if + * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is + * filled with the RSS context ID associated with the rule. + * + * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the + * user buffer for @rule_locs on entry. On return, @data is the size + * of the rule table, @rule_cnt is the number of defined rules, and + * @rule_locs contains the locations of the defined rules. Drivers + * must use the second parameter to get_rxnfc() instead of @rule_locs. + * + * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. + * @fs.@location either specifies the location to use or is a special + * location value with %RX_CLS_LOC_SPECIAL flag set. On return, + * @fs.@location is the actual rule location. If @fs.@flow_type + * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to + * use for flow spreading traffic which matches this rule. The value + * from the rxfh indirection table will be added to @fs.@ring_cookie + * to choose which ring to deliver to. + * + * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an + * existing rule on entry. + * + * A driver supporting the special location values for + * %ETHTOOL_SRXCLSRLINS may add the rule at any suitable unused + * location, and may remove a rule at a later location (lower + * priority) that matches exactly the same set of flows. The special + * values are %RX_CLS_LOC_ANY, selecting any location; + * %RX_CLS_LOC_FIRST, selecting the first suitable location (maximum + * priority); and %RX_CLS_LOC_LAST, selecting the last suitable + * location (minimum priority). Additional special values may be + * defined in future and drivers must return -%EINVAL for any + * unrecognised value. + */ +struct ethtool_rxnfc { + __u32 cmd; + __u32 flow_type; + __u64 data; + struct ethtool_rx_flow_spec fs; + union { + __u32 rule_cnt; + __u32 rss_context; + }; + __u32 rule_locs[]; +}; + + +/** + * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection + * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR + * @size: On entry, the array size of the user buffer, which may be zero. + * On return from %ETHTOOL_GRXFHINDIR, the array size of the hardware + * indirection table. + * @ring_index: RX ring/queue index for each hash value + * + * For %ETHTOOL_GRXFHINDIR, a @size of zero means that only the size + * should be returned. For %ETHTOOL_SRXFHINDIR, a @size of zero means + * the table should be reset to default values. This last feature + * is not supported by the original implementations. + */ +struct ethtool_rxfh_indir { + __u32 cmd; + __u32 size; + __u32 ring_index[]; +}; + +/** + * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. + * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command + * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. + * @indir_size: On entry, the array size of the user buffer for the + * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), + * %ETH_RXFH_INDIR_NO_CHANGE. 
On return from %ETHTOOL_GRSSH, + * the array size of the hardware indirection table. + * @key_size: On entry, the array size of the user buffer for the hash key, + * which may be zero. On return from %ETHTOOL_GRSSH, the size of the + * hardware hash key. + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. + * @rsvd8: Reserved for future use; see the note on reserved space. + * @rsvd32: Reserved for future use; see the note on reserved space. + * @rss_config: RX ring/queue index for each hash value i.e., indirection table + * of @indir_size __u32 elements, followed by hash key of @key_size + * bytes. + * + * For %ETHTOOL_GRSSH, a @indir_size and key_size of zero means that only the + * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of + * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested + * and a @indir_size of zero means the indir table should be reset to default + * values (if @rss_context == 0) or that the RSS context should be deleted. + * An hfunc of zero means that hash function setting is not requested. + */ +struct ethtool_rxfh { + __u32 cmd; + __u32 rss_context; + __u32 indir_size; + __u32 key_size; + __u8 hfunc; + __u8 input_xfrm; + __u8 rsvd8[2]; + __u32 rsvd32; + __u32 rss_config[]; +}; +#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff +#define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff + +/** + * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter + * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW + * @h_u: Flow field values to match (dependent on @flow_type) + * @m_u: Masks for flow field value bits to be ignored + * @vlan_tag: VLAN tag to match + * @vlan_tag_mask: Mask for VLAN tag bits to be ignored + * @data: Driver-dependent data to match + * @data_mask: Mask for driver-dependent data bits to be ignored + * @action: RX ring/queue index to deliver to (non-negative) or other action + * (negative, e.g. %ETHTOOL_RXNTUPLE_ACTION_DROP) + * + * For flow types %TCP_V4_FLOW, %UDP_V4_FLOW and %SCTP_V4_FLOW, where + * a field value and mask are both zero this is treated as if all mask + * bits are set i.e. the field is ignored. 
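A sketch of the size-probe-then-fetch flow implied above for %ETHTOOL_GRSSH on the default RSS context (fd and ifr as before):

        struct ethtool_rxfh probe = { .cmd = ETHTOOL_GRSSH };

        ifr.ifr_data = (void *)&probe;
        ioctl(fd, SIOCETHTOOL, &ifr);               /* indir_size/key_size of 0: sizes only */

        struct ethtool_rxfh *rxfh = calloc(1, sizeof(*rxfh) +
                        probe.indir_size * sizeof(__u32) + probe.key_size);
        rxfh->cmd = ETHTOOL_GRSSH;
        rxfh->indir_size = probe.indir_size;
        rxfh->key_size = probe.key_size;
        ifr.ifr_data = (void *)rxfh;
        ioctl(fd, SIOCETHTOOL, &ifr);               /* rss_config: indir table, then hash key */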
+ */ +struct ethtool_rx_ntuple_flow_spec { + __u32 flow_type; + union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethhdr ether_spec; + __u8 hdata[72]; + } h_u, m_u; + + __u16 vlan_tag; + __u16 vlan_tag_mask; + __u64 data; + __u64 data_mask; + + __s32 action; +#define ETHTOOL_RXNTUPLE_ACTION_DROP (-1) /* drop packet */ +#define ETHTOOL_RXNTUPLE_ACTION_CLEAR (-2) /* clear filter */ +}; + +/** + * struct ethtool_rx_ntuple - command to set or clear RX flow filter + * @cmd: Command number - %ETHTOOL_SRXNTUPLE + * @fs: Flow filter specification + */ +struct ethtool_rx_ntuple { + __u32 cmd; + struct ethtool_rx_ntuple_flow_spec fs; +}; + +#define ETHTOOL_FLASH_MAX_FILENAME 128 +enum ethtool_flash_op_type { + ETHTOOL_FLASH_ALL_REGIONS = 0, +}; + +/* for passing firmware flashing related parameters */ +struct ethtool_flash { + __u32 cmd; + __u32 region; + char data[ETHTOOL_FLASH_MAX_FILENAME]; +}; + +/** + * struct ethtool_dump - used for retrieving, setting device dump + * @cmd: Command number - %ETHTOOL_GET_DUMP_FLAG, %ETHTOOL_GET_DUMP_DATA, or + * %ETHTOOL_SET_DUMP + * @version: FW version of the dump, filled in by driver + * @flag: driver dependent flag for dump setting, filled in by driver during + * get and filled in by ethtool for set operation. + * flag must be initialized by macro ETH_FW_DUMP_DISABLE value when + * firmware dump is disabled. + * @len: length of dump data, used as the length of the user buffer on entry to + * %ETHTOOL_GET_DUMP_DATA and this is returned as dump length by driver + * for %ETHTOOL_GET_DUMP_FLAG command + * @data: data collected for get dump data operation + */ +struct ethtool_dump { + __u32 cmd; + __u32 version; + __u32 flag; + __u32 len; + __u8 data[]; +}; + +#define ETH_FW_DUMP_DISABLE 0 + +/* for returning and changing feature sets */ + +/** + * struct ethtool_get_features_block - block with state of 32 features + * @available: mask of changeable features + * @requested: mask of features requested to be enabled if possible + * @active: mask of currently enabled features + * @never_changed: mask of features not changeable for any device + */ +struct ethtool_get_features_block { + __u32 available; + __u32 requested; + __u32 active; + __u32 never_changed; +}; + +/** + * struct ethtool_gfeatures - command to get state of device's features + * @cmd: command number = %ETHTOOL_GFEATURES + * @size: On entry, the number of elements in the features[] array; + * on return, the number of elements in features[] needed to hold + * all features + * @features: state of features + */ +struct ethtool_gfeatures { + __u32 cmd; + __u32 size; + struct ethtool_get_features_block features[]; +}; + +/** + * struct ethtool_set_features_block - block with request for 32 features + * @valid: mask of features to be changed + * @requested: values of features to be changed + */ +struct ethtool_set_features_block { + __u32 valid; + __u32 requested; +}; + +/** + * struct ethtool_sfeatures - command to request change in device's features + * @cmd: command number = %ETHTOOL_SFEATURES + * @size: array size of the features[] array + * @features: feature change masks + */ +struct ethtool_sfeatures { + __u32 cmd; + __u32 size; + struct ethtool_set_features_block features[]; +}; + +/** + * struct ethtool_ts_info - holds a device's timestamping and PHC association + * 
@cmd: command number = %ETHTOOL_GET_TS_INFO + * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags + * @phc_index: device index of the associated PHC, or -1 if there is none + * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values + * @tx_reserved: Reserved for future use; see the note on reserved space. + * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values + * @rx_reserved: Reserved for future use; see the note on reserved space. + * + * The bits in the 'tx_types' and 'rx_filters' fields correspond to + * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values, + * respectively. For example, if the device supports HWTSTAMP_TX_ON, + * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set. + * + * Drivers should only report the filters they actually support without + * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWTSTAMP request for + * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the + * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op. + */ +struct ethtool_ts_info { + __u32 cmd; + __u32 so_timestamping; + __s32 phc_index; + __u32 tx_types; + __u32 tx_reserved[3]; + __u32 rx_filters; + __u32 rx_reserved[3]; +}; + +/* + * %ETHTOOL_SFEATURES changes features present in features[].valid to the + * values of corresponding bits in features[].requested. Bits in .requested + * not set in .valid or not changeable are ignored. + * + * Returns %EINVAL when .valid contains undefined or never-changeable bits + * or size is not equal to required number of features words (32-bit blocks). + * Returns >= 0 if request was completed; bits set in the value mean: + * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not + * changeable (not present in %ETHTOOL_GFEATURES' features[].available) + * those bits were ignored. + * %ETHTOOL_F_WISH - some or all changes requested were recorded but the + * resulting state of bits masked by .valid is not equal to .requested. + * Probably there are other device-specific constraints on some features + * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered + * here as though ignored bits were cleared. + * %ETHTOOL_F_COMPAT - some or all changes requested were made by calling + * compatibility functions. Requested offload state cannot be properly + * managed by kernel. + * + * Meaning of bits in the masks are obtained by %ETHTOOL_GSSET_INFO (number of + * bits in the arrays - always multiple of 32) and %ETHTOOL_GSTRINGS commands + * for ETH_SS_FEATURES string set. First entry in the table corresponds to least + * significant bit in features[0] fields. Empty strings mark undefined features. + */ +enum ethtool_sfeatures_retval_bits { + ETHTOOL_F_UNSUPPORTED__BIT, + ETHTOOL_F_WISH__BIT, + ETHTOOL_F_COMPAT__BIT, +}; + +#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT) +#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) +#define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT) + +#define MAX_NUM_QUEUE 4096 + +/** + * struct ethtool_per_queue_op - apply sub command to the queues in mask.
+ * @cmd: ETHTOOL_PERQUEUE + * @sub_command: the sub command which applies to each queue + * @queue_mask: Bitmap of the queues which the sub command applies to + * @data: A complete command structure following for each of the queues addressed + */ +struct ethtool_per_queue_op { + __u32 cmd; + __u32 sub_command; + __u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; + char data[]; +}; + +/** + * struct ethtool_fecparam - Ethernet Forward Error Correction parameters + * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM + * @active_fec: FEC mode which is active on the port, single bit set, GET only. + * @fec: Bitmask of configured FEC modes. + * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. + * + * Note that @reserved was never validated on input and ethtool user space + * left it uninitialized when calling SET. Hence going forward it can only be + * used to return a value to userspace with GET. + * + * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. + * FEC settings are configured by link autonegotiation whenever it's enabled. + * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. + * + * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings. + * It is recommended that drivers only accept a single bit set in @fec. + * When multiple bits are set in @fec drivers may pick mode in an implementation + * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other + * FEC modes, because it's unclear whether in this case other modes constrain + * AUTO or are independent choices. + * Drivers must reject SET requests if they support none of the requested modes. + * + * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead + * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM. + * + * See enum ethtool_fec_config_bits for definition of valid bits for both + * @fec and @active_fec. + */ +struct ethtool_fecparam { + __u32 cmd; + /* bitmask of FEC modes */ + __u32 active_fec; + __u32 fec; + __u32 reserved; +}; + +/** + * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration + * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not + * be used together with other bits. GET only. + * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually + * based on link mode and SFP parameters read from module's + * EEPROM. This bit does _not_ mean autonegotiation. + * @ETHTOOL_FEC_OFF_BIT: No FEC Mode + * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode + * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode + * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet + * Consortium) + */ +enum ethtool_fec_config_bits { + ETHTOOL_FEC_NONE_BIT, + ETHTOOL_FEC_AUTO_BIT, + ETHTOOL_FEC_OFF_BIT, + ETHTOOL_FEC_RS_BIT, + ETHTOOL_FEC_BASER_BIT, + ETHTOOL_FEC_LLRS_BIT, +}; + +#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) +#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) +#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) +#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) +#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) +#define ETHTOOL_FEC_LLRS (1 << ETHTOOL_FEC_LLRS_BIT) + +/* CMDs currently supported */ +#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. + * Please use ETHTOOL_GLINKSETTINGS + */ +#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings. + * Please use ETHTOOL_SLINKSETTINGS + */ +#define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info.
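For %ETHTOOL_PERQUEUE above, @data carries one sub-command block per bit set in @queue_mask; a rough sketch for reading the coalescing settings of queues 0 and 2 (same assumptions as the earlier examples):

        struct ethtool_per_queue_op *pq;
        int nqueues = 2;                            /* bits set in queue_mask below */

        pq = calloc(1, sizeof(*pq) + nqueues * sizeof(struct ethtool_coalesce));
        pq->cmd = ETHTOOL_PERQUEUE;
        pq->sub_command = ETHTOOL_GCOALESCE;
        pq->queue_mask[0] = (1 << 0) | (1 << 2);
        ifr.ifr_data = (void *)pq;
        ioctl(fd, SIOCETHTOOL, &ifr);               /* pq->data: one struct ethtool_coalesce per queue */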
*/ +#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */ +#define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */ +#define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */ +#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ +#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ +#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */ +/* Get link status for host, i.e. whether the interface *and* the + * physical port (if there is one) are up (ethtool_value). */ +#define ETHTOOL_GLINK 0x0000000a +#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ +#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ +#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ +#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ +#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ +#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. */ +#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ +#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ +#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ +#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ +#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ +#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ +#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable + * (ethtool_value) */ +#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable + * (ethtool_value). */ +#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */ +#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ +#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ +#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ +#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ +#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ +#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ +#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ +#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ +#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ +#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */ +#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */ +#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ +#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ + +#define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ +#define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ +#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ +#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ +#define ETHTOOL_GRXRINGS 0x0000002d /* Get RX rings available for LB */ +#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */ +#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */ +#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rule */ +#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ +#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ +#define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */ +#define ETHTOOL_RESET 0x00000034 /* Reset hardware */ +#define ETHTOOL_SRXNTUPLE 0x00000035 /* Add an n-tuple filter to device */ +#define ETHTOOL_GRXNTUPLE 
0x00000036 /* deprecated */ +#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */ +#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */ +#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */ + +#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */ +#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */ +#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ +#define ETHTOOL_SCHANNELS 0x0000003d /* Set no of channels */ +#define ETHTOOL_SET_DUMP 0x0000003e /* Set dump settings */ +#define ETHTOOL_GET_DUMP_FLAG 0x0000003f /* Get dump settings */ +#define ETHTOOL_GET_DUMP_DATA 0x00000040 /* Get dump data */ +#define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */ +#define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */ +#define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */ +#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ +#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ + +#define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ +#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ +#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ + +#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ + +#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ +#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ +#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ +#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ +#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ +#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ + +/* compatibility with older code */ +#define SPARC_ETH_GSET ETHTOOL_GSET +#define SPARC_ETH_SSET ETHTOOL_SSET + +/* Link mode bit indices */ +enum ethtool_link_mode_bit_indices { + ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0, + ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1, + ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2, + ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3, + ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5, + ETHTOOL_LINK_MODE_Autoneg_BIT = 6, + ETHTOOL_LINK_MODE_TP_BIT = 7, + ETHTOOL_LINK_MODE_AUI_BIT = 8, + ETHTOOL_LINK_MODE_MII_BIT = 9, + ETHTOOL_LINK_MODE_FIBRE_BIT = 10, + ETHTOOL_LINK_MODE_BNC_BIT = 11, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12, + ETHTOOL_LINK_MODE_Pause_BIT = 13, + ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14, + ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15, + ETHTOOL_LINK_MODE_Backplane_BIT = 16, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19, + ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20, + ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27, + ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28, + ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, + ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, + + /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit + * 31. 
Please do NOT define any SUPPORTED_* or ADVERTISED_* + * macro for bits > 31. The only way to use indices > 31 is to + * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. + */ + + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT = 36, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, + ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41, + ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42, + ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43, + ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44, + ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45, + ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46, + ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, + ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, + + ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, + ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, + ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, + ETHTOOL_LINK_MODE_50000baseKR_Full_BIT = 52, + ETHTOOL_LINK_MODE_50000baseSR_Full_BIT = 53, + ETHTOOL_LINK_MODE_50000baseCR_Full_BIT = 54, + ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT = 55, + ETHTOOL_LINK_MODE_50000baseDR_Full_BIT = 56, + ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT = 57, + ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT = 58, + ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT = 59, + ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60, + ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT = 61, + ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT = 62, + ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT = 63, + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64, + ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, + ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, + ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, + ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, + ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, + ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, + ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, + ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, + ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, + ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, + ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, + ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, + ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, + ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, + ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, + ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, + ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, + ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, + ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, + ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, + ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, + ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, + ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, + ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, + ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, + ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, + ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, + ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, + ETHTOOL_LINK_MODE_10baseT1S_Full_BIT = 99, + ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, + 
ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, + + /* must be last entry */ + __ETHTOOL_LINK_MODE_MASK_NBITS +}; + +#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ + (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT)) + +/* DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new SUPPORTED_* macro for bits > 31. + */ +#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define SUPPORTED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new SUPPORTED_* macro for bits > 31, see + * notice above. + */ + +/* + * DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new ADVERTISED_* macro for bits > 31.
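Since the legacy SUPPORTED_*/ADVERTISED_* masks stop at bit 31, higher link-mode bits have to be tested in the __u32 mask words returned by %ETHTOOL_GLINKSETTINGS; one possible helper, shown only as a sketch:

        /* test one enum ethtool_link_mode_bit_indices bit in a link-mode mask array */
        static inline int link_mode_test_bit(const __u32 *mask, unsigned int nr)
        {
                return (mask[nr / 32] >> (nr % 32)) & 1;
        }
        /* e.g. link_mode_test_bit(supported, ETHTOOL_LINK_MODE_50000baseKR_Full_BIT) */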
+ */ +#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new ADVERTISED_* macro for bits > 31, see + * notice above. + */ + +/* The following are all involved in forcing a particular link + * mode for the device for setting things. When getting the + * devices settings, these indicate the current mode and whether + * it was forced up into this mode or autonegotiated. + */ + +/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. + * Update drivers/net/phy/phy.c:phy_speed_to_str() and + * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values. 
+ */ +#define SPEED_10 10 +#define SPEED_100 100 +#define SPEED_1000 1000 +#define SPEED_2500 2500 +#define SPEED_5000 5000 +#define SPEED_10000 10000 +#define SPEED_14000 14000 +#define SPEED_20000 20000 +#define SPEED_25000 25000 +#define SPEED_40000 40000 +#define SPEED_50000 50000 +#define SPEED_56000 56000 +#define SPEED_100000 100000 +#define SPEED_200000 200000 +#define SPEED_400000 400000 +#define SPEED_800000 800000 + +#define SPEED_UNKNOWN -1 + +static inline int ethtool_validate_speed(__u32 speed) +{ + return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN; +} + +/* Duplex, half or full. */ +#define DUPLEX_HALF 0x00 +#define DUPLEX_FULL 0x01 +#define DUPLEX_UNKNOWN 0xff + +static inline int ethtool_validate_duplex(__u8 duplex) +{ + switch (duplex) { + case DUPLEX_HALF: + case DUPLEX_FULL: + case DUPLEX_UNKNOWN: + return 1; + } + + return 0; +} + +#define MASTER_SLAVE_CFG_UNSUPPORTED 0 +#define MASTER_SLAVE_CFG_UNKNOWN 1 +#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 +#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 +#define MASTER_SLAVE_CFG_MASTER_FORCE 4 +#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 +#define MASTER_SLAVE_STATE_UNSUPPORTED 0 +#define MASTER_SLAVE_STATE_UNKNOWN 1 +#define MASTER_SLAVE_STATE_MASTER 2 +#define MASTER_SLAVE_STATE_SLAVE 3 +#define MASTER_SLAVE_STATE_ERR 4 + +/* These are used to throttle the rate of data on the phy interface when the + * native speed of the interface is higher than the link speed. These should + * not be used for phy interfaces which natively support multiple speeds (e.g. + * MII or SGMII). + */ +/* No rate matching performed. */ +#define RATE_MATCH_NONE 0 +/* The phy sends pause frames to throttle the MAC. */ +#define RATE_MATCH_PAUSE 1 +/* The phy asserts CRS to prevent the MAC from transmitting. */ +#define RATE_MATCH_CRS 2 +/* The MAC is programmed with a sufficiently-large IPG. */ +#define RATE_MATCH_OPEN_LOOP 3 + +/* Which connector port. */ +#define PORT_TP 0x00 +#define PORT_AUI 0x01 +#define PORT_MII 0x02 +#define PORT_FIBRE 0x03 +#define PORT_BNC 0x04 +#define PORT_DA 0x05 +#define PORT_NONE 0xef +#define PORT_OTHER 0xff + +/* Which transceiver to use. */ +#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */ +#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */ +#define XCVR_DUMMY1 0x02 +#define XCVR_DUMMY2 0x03 +#define XCVR_DUMMY3 0x04 + +/* Enable or disable autonegotiation. */ +#define AUTONEG_DISABLE 0x00 +#define AUTONEG_ENABLE 0x01 + +/* MDI or MDI-X status/control - if MDI/MDI_X/AUTO is set then + * the driver is required to renegotiate link + */ +#define ETH_TP_MDI_INVALID 0x00 /* status: unknown; control: unsupported */ +#define ETH_TP_MDI 0x01 /* status: MDI; control: force MDI */ +#define ETH_TP_MDI_X 0x02 /* status: MDI-X; control: force MDI-X */ +#define ETH_TP_MDI_AUTO 0x03 /* control: auto-select */ + +/* Wake-On-Lan options. */ +#define WAKE_PHY (1 << 0) +#define WAKE_UCAST (1 << 1) +#define WAKE_MCAST (1 << 2) +#define WAKE_BCAST (1 << 3) +#define WAKE_ARP (1 << 4) +#define WAKE_MAGIC (1 << 5) +#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */ +#define WAKE_FILTER (1 << 7) + +#define WOL_MODE_COUNT 8 + +/* RSS hash function data + * XOR the corresponding source and destination fields of each specified + * protocol. Both copies of the XOR'ed fields are fed into the RSS and RXHASH + * calculation. Note that this XORing reduces the input set entropy and could + * be exploited to reduce the RSS queue spread. 
+ */ +#define RXH_XFRM_SYM_XOR (1 << 0) +#define RXH_XFRM_NO_CHANGE 0xff + +/* L2-L4 network traffic flow types */ +#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ +#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ +#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ +#define AH_ESP_V4_FLOW 0x04 /* hash only */ +#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ +#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ +#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ +#define AH_ESP_V6_FLOW 0x08 /* hash only */ +#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ +#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ +#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ +#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ +#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#define IP_USER_FLOW IPV4_USER_FLOW +#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ +#define IPV4_FLOW 0x10 /* hash only */ +#define IPV6_FLOW 0x11 /* hash only */ +#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ + +/* Used for GTP-U IPv4 and IPv6. + * The format of GTP packets only includes + * elements such as TEID and GTP version. + * It is primarily intended for data communication of the UE. + */ +#define GTPU_V4_FLOW 0x13 /* hash only */ +#define GTPU_V6_FLOW 0x14 /* hash only */ + +/* Use for GTP-C IPv4 and v6. + * The format of these GTP packets does not include TEID. + * Primarily expected to be used for communication + * to create sessions for UE data communication, + * commonly referred to as CSR (Create Session Request). + */ +#define GTPC_V4_FLOW 0x15 /* hash only */ +#define GTPC_V6_FLOW 0x16 /* hash only */ + +/* Use for GTP-C IPv4 and v6. + * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. + * After session creation, it becomes this packet. + * This is mainly used for requests to realize UE handover. + */ +#define GTPC_TEID_V4_FLOW 0x17 /* hash only */ +#define GTPC_TEID_V6_FLOW 0x18 /* hash only */ + +/* Use for GTP-U and extended headers for the PSC (PDU Session Container). + * The format of these GTP packets includes TEID and QFI. + * In 5G communication using UPF (User Plane Function), + * data communication with this extended header is performed. + */ +#define GTPU_EH_V4_FLOW 0x19 /* hash only */ +#define GTPU_EH_V6_FLOW 0x1a /* hash only */ + +/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. + * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by + * UL/DL included in the PSC. + * There are differences in the data included based on Downlink/Uplink, + * and can be used to distinguish packets. + * The functions described so far are useful when you want to + * handle communication from the mobile network in UPF, PGW, etc. 
+ */ +#define GTPU_UL_V4_FLOW 0x1b /* hash only */ +#define GTPU_UL_V6_FLOW 0x1c /* hash only */ +#define GTPU_DL_V4_FLOW 0x1d /* hash only */ +#define GTPU_DL_V6_FLOW 0x1e /* hash only */ + +/* Flag to enable additional fields in struct ethtool_rx_flow_spec */ +#define FLOW_EXT 0x80000000 +#define FLOW_MAC_EXT 0x40000000 +/* Flag to enable RSS spreading of traffic matching rule (nfc only) */ +#define FLOW_RSS 0x20000000 + +/* L3-L4 network traffic flow hash options */ +#define RXH_L2DA (1 << 1) +#define RXH_VLAN (1 << 2) +#define RXH_L3_PROTO (1 << 3) +#define RXH_IP_SRC (1 << 4) +#define RXH_IP_DST (1 << 5) +#define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ +#define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ +#define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ +#define RXH_DISCARD (1 << 31) + +#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL +#define RX_CLS_FLOW_WAKE 0xfffffffffffffffeULL + +/* Special RX classification rule insert location values */ +#define RX_CLS_LOC_SPECIAL 0x80000000 /* flag */ +#define RX_CLS_LOC_ANY 0xffffffff +#define RX_CLS_LOC_FIRST 0xfffffffe +#define RX_CLS_LOC_LAST 0xfffffffd + +/* EEPROM Standards for plug in modules */ +#define ETH_MODULE_SFF_8079 0x1 +#define ETH_MODULE_SFF_8079_LEN 256 +#define ETH_MODULE_SFF_8472 0x2 +#define ETH_MODULE_SFF_8472_LEN 512 +#define ETH_MODULE_SFF_8636 0x3 +#define ETH_MODULE_SFF_8636_LEN 256 +#define ETH_MODULE_SFF_8436 0x4 +#define ETH_MODULE_SFF_8436_LEN 256 + +#define ETH_MODULE_SFF_8636_MAX_LEN 640 +#define ETH_MODULE_SFF_8436_MAX_LEN 640 + +/* Reset flags */ +/* The reset() operation must clear the flags for the components which + * were actually reset. On successful return, the flags indicate the + * components which were not reset, either because they do not exist + * in the hardware or because they cannot be reset independently. The + * driver must never reset any components that were not requested. + */ +enum ethtool_reset_flags { + /* These flags represent components dedicated to the interface + * the command is addressed to. Shift any flag left by + * ETH_RESET_SHARED_SHIFT to reset a shared component of the + * same type. + */ + ETH_RESET_MGMT = 1 << 0, /* Management processor */ + ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */ + ETH_RESET_DMA = 1 << 2, /* DMA engine */ + ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */ + ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */ + ETH_RESET_MAC = 1 << 5, /* Media access controller */ + ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ + ETH_RESET_RAM = 1 << 7, /* RAM shared between + * multiple components */ + ETH_RESET_AP = 1 << 8, /* Application processor */ + + ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to + * this interface */ + ETH_RESET_ALL = 0xffffffff, /* All components used by this + * interface, even if shared */ +}; +#define ETH_RESET_SHARED_SHIFT 16 + + +/** + * struct ethtool_link_settings - link control and status + * + * IMPORTANT, Backward compatibility notice: When implementing new + * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and + * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link + * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS + * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in + * that case. 
Conversely, if %ETHTOOL_GLINKSETTINGS fails, use + * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link + * settings; do not use %ETHTOOL_SLINKSETTINGS if + * %ETHTOOL_GLINKSETTINGS failed: stick to + * %ETHTOOL_GSET/%ETHTOOL_SSET in that case. + * + * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS + * @speed: Link speed (Mbps) + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @link_mode_masks_nwords: Number of 32-bit words for each of the + * supported, advertising, lp_advertising link mode bitmaps. For + * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user + * (>= 0); on return, if handshake in progress, negative if + * request size unsupported by kernel: absolute value indicates + * kernel expected size and all the other fields but cmd + * are 0; otherwise (handshake completed), strictly positive + * to indicate size used by kernel and cmd field stays + * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For + * %ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive + * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise + * refused. For drivers: ignore this field (use kernel's + * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will + * be overwritten by kernel. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. + * @master_slave_cfg: Master/slave port mode. + * @master_slave_state: Master/slave port state. + * @rate_matching: Rate adaptation performed by the PHY + * @reserved: Reserved for future use; see the note on reserved space. + * @link_mode_masks: Variable length bitmaps. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt + * are not available in %ethtool_link_settings. 
These fields will be + * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will + * fail if any of them is set to non-zero value. + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GLINKSETTINGS to get the current values before making specific + * changes and then applying them with %ETHTOOL_SLINKSETTINGS. + * + * Drivers that implement %get_link_ksettings and/or + * %set_link_ksettings should ignore the @cmd + * and @link_mode_masks_nwords fields (any change to them overwritten + * by kernel), and rely only on kernel's internal + * %__ETHTOOL_LINK_MODE_MASK_NBITS and + * %ethtool_link_mode_mask_t. Drivers that implement + * %set_link_ksettings() should validate all fields other than @cmd + * and @link_mode_masks_nwords that are not described as read-only or + * deprecated, and must ignore all fields described as read-only. + * + * @link_mode_masks is divided into three bitfields, each of length + * @link_mode_masks_nwords: + * - supported: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features for which the interface + * supports autonegotiation or auto-detection. Read-only. + * - advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features that are advertised through + * autonegotiation or enabled for auto-detection. + * - lp_advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, and other + * link features that the link partner advertised through + * autonegotiation; 0 if unknown or not applicable. Read-only. + */ +struct ethtool_link_settings { + __u32 cmd; + __u32 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 autoneg; + __u8 mdio_support; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __s8 link_mode_masks_nwords; + __u8 transceiver; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 rate_matching; + __u32 reserved[7]; + __u32 link_mode_masks[]; + /* layout of link_mode_masks fields: + * __u32 map_supported[link_mode_masks_nwords]; + * __u32 map_advertising[link_mode_masks_nwords]; + * __u32 map_lp_advertising[link_mode_masks_nwords]; + */ +}; #endif /* _UAPI_LINUX_ETHTOOL_H */ diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 1ce738d91685..b5c7ce5c243a 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -2,7 +2,7 @@ #ifndef __BPF_CORE_READ_H__ #define __BPF_CORE_READ_H__ -#include <bpf/bpf_helpers.h> +#include "bpf_helpers.h" /* * enum bpf_field_info_kind is passed as a second argument into diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index cd17f6d0791f..62e1c0cc4a59 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -137,7 +137,8 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. 
*/ -#if __clang_major__ >= 8 && defined(__bpf__) +#if (defined(__clang__) && __clang_major__ >= 8) || (!defined(__clang__) && __GNUC__ > 12) +#if defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -165,6 +166,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) : "r0", "r1", "r2", "r3", "r4", "r5"); } #endif +#endif enum libbpf_pin_type { LIBBPF_PIN_NONE, diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 4d9f30bf7f01..5dbca76b953f 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1929,6 +1929,7 @@ static int btf_dump_int_data(struct btf_dump *d, if (d->typed_dump->is_array_terminated) break; if (*(char *)data == '\0') { + btf_dump_type_values(d, "'\\0'"); d->typed_dump->is_array_terminated = true; break; } @@ -2031,6 +2032,7 @@ static int btf_dump_array_data(struct btf_dump *d, __u32 i, elem_type_id; __s64 elem_size; bool is_array_member; + bool is_array_terminated; elem_type_id = array->type; elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); @@ -2066,12 +2068,15 @@ static int btf_dump_array_data(struct btf_dump *d, */ is_array_member = d->typed_dump->is_array_member; d->typed_dump->is_array_member = true; + is_array_terminated = d->typed_dump->is_array_terminated; + d->typed_dump->is_array_terminated = false; for (i = 0; i < array->nelems; i++, data += elem_size) { if (d->typed_dump->is_array_terminated) break; btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); } d->typed_dump->is_array_member = is_array_member; + d->typed_dump->is_array_terminated = is_array_terminated; d->typed_dump->depth--; btf_dump_data_pfx(d); btf_dump_type_values(d, "]"); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d7d8f78f8846..97eb6e5dd7c8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -149,6 +149,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_TCX] = "tcx", [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", [BPF_LINK_TYPE_NETKIT] = "netkit", + [BPF_LINK_TYPE_SOCKMAP] = "sockmap", }; static const char * const map_type_name[] = { @@ -1970,6 +1971,20 @@ static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, return NULL; } +static struct extern_desc *find_extern_by_name_with_len(const struct bpf_object *obj, + const void *name, int len) +{ + const char *ext_name; + int i; + + for (i = 0; i < obj->nr_extern; i++) { + ext_name = obj->externs[i].name; + if (strlen(ext_name) == len && strncmp(ext_name, name, len) == 0) + return &obj->externs[i]; + } + return NULL; +} + static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, char value) { @@ -7986,7 +8001,10 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) return 0; } -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +static int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) { char sym_type, sym_name[500]; unsigned long long sym_addr; @@ -8026,8 +8044,13 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, struct bpf_object *obj = ctx; const struct btf_type *t; struct extern_desc *ext; + char *res; - ext = find_extern_by_name(obj, sym_name); + res = strstr(sym_name, ".llvm."); + if (sym_type == 'd' && res) + ext = find_extern_by_name_with_len(obj, sym_name, res - sym_name); + else + ext = find_extern_by_name(obj, sym_name); if (!ext || ext->type != EXT_KSYM) return 
0; @@ -12511,6 +12534,12 @@ bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) return bpf_program_attach_fd(prog, netns_fd, "netns", NULL); } +struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd) +{ + return bpf_program_attach_fd(prog, map_fd, "sockmap", NULL); +} + struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) { /* target_fd/target_ifindex use the same field in LINK_CREATE */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f88ab50c0229..1333ae20ebe6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -795,6 +795,8 @@ bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); LIBBPF_API struct bpf_link * bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); LIBBPF_API struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd); +LIBBPF_API struct bpf_link * bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(const struct bpf_program *prog, @@ -1293,6 +1295,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx); LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__consume_n(struct ring_buffer *rb, size_t n); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); /** @@ -1367,6 +1370,17 @@ LIBBPF_API int ring__map_fd(const struct ring *r); */ LIBBPF_API int ring__consume(struct ring *r); +/** + * @brief **ring__consume_n()** consumes up to a requested amount of items from + * a ringbuffer without event polling. + * + * @param r A ringbuffer object. + * @param n Maximum amount of items to consume. + * @return The number of items consumed, or a negative number if any of the + * callbacks return an error. 
+ */ +LIBBPF_API int ring__consume_n(struct ring *r, size_t n); + struct user_ring_buffer_opts { size_t sz; /* size of this struct, for forward/backward compatibility */ }; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 51732ecb1385..c1ce8aa3520b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -416,3 +416,10 @@ LIBBPF_1.4.0 { btf__new_split; btf_ext__raw_data; } LIBBPF_1.3.0; + +LIBBPF_1.5.0 { + global: + bpf_program__attach_sockmap; + ring__consume_n; + ring_buffer__consume_n; +} LIBBPF_1.4.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 864b36177424..a0dcfb82e455 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -518,11 +518,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind); -typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx); - -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); - /* handle direct returned errors */ static inline int libbpf_err(int ret) { diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 302188122439..9dfbe7750f56 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -448,7 +448,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) * at all, it will emit something like "invalid func unknown#181". * If BPF verifier recognizes BPF helper but it's not supported for - * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166" + * or "program of this type cannot use helper bpf_sys_bpf#166". * In both cases, provided combination of BPF program type and BPF * helper is not supported by the kernel. * In all other cases, probe_prog_load() above will either succeed (e.g., @@ -457,7 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe * that), or we'll get some more specific BPF verifier error about * some unsatisfied conditions. 
*/ - if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") || + strstr(buf, "program of this type cannot use helper "))) return 0; return 1; /* assume supported */ } diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index e783a47da815..d6e5eff967cb 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 4 +#define LIBBPF_MINOR_VERSION 5 #endif /* __LIBBPF_VERSION_H */ diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index aacb64278a01..99e44cf02321 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -231,7 +231,7 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } -static int64_t ringbuf_process_ring(struct ring *r) +static int64_t ringbuf_process_ring(struct ring *r, size_t n) { int *len_ptr, len, err; /* 64-bit to avoid overflow in case of extreme application behavior */ @@ -268,12 +268,42 @@ static int64_t ringbuf_process_ring(struct ring *r) } smp_store_release(r->consumer_pos, cons_pos); + + if (cnt >= n) + goto done; } } while (got_new_data); done: return cnt; } +/* Consume available ring buffer(s) data without event polling, up to n + * records. + * + * Returns number of records consumed across all registered ring buffers (or + * n, whichever is less), or negative number if any of the callbacks return + * error. + */ +int ring_buffer__consume_n(struct ring_buffer *rb, size_t n) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = rb->rings[i]; + + err = ringbuf_process_ring(ring, n); + if (err < 0) + return libbpf_err(err); + res += err; + n -= err; + + if (n == 0) + break; + } + return res; +} + /* Consume available ring buffer(s) data without event polling. * Returns number of records consumed across all registered ring buffers (or * INT_MAX, whichever is less), or negative number if any of the callbacks @@ -287,13 +317,15 @@ int ring_buffer__consume(struct ring_buffer *rb) for (i = 0; i < rb->ring_cnt; i++) { struct ring *ring = rb->rings[i]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; + if (res > INT_MAX) { + res = INT_MAX; + break; + } } - if (res > INT_MAX) - return INT_MAX; return res; } @@ -314,13 +346,13 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) __u32 ring_id = rb->events[i].data.fd; struct ring *ring = rb->rings[ring_id]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; } if (res > INT_MAX) - return INT_MAX; + res = INT_MAX; return res; } @@ -371,17 +403,22 @@ int ring__map_fd(const struct ring *r) return r->map_fd; } -int ring__consume(struct ring *r) +int ring__consume_n(struct ring *r, size_t n) { - int64_t res; + int res; - res = ringbuf_process_ring(r); + res = ringbuf_process_ring(r, n); if (res < 0) return libbpf_err(res); return res > INT_MAX ? 
INT_MAX : res; } +int ring__consume(struct ring *r) +{ + return ring__consume_n(r, INT_MAX); +} + static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) { if (rb->consumer_pos) { diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index d8ade15e2789..cf657fc35619 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,5 +10,4 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) -verifier_arena # JIT does not support arena -arena_htab # JIT does not support arena +arena_atomics diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index f4a2f66a683d..c34adf39eeb2 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -6,3 +6,4 @@ stacktrace_build_id # compare_map_keys stackid_hmap vs. sta verifier_iterating_callbacks verifier_arena # JIT does not support arena arena_htab # JIT does not support arena +arena_atomics diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f06c527eee34..82247aeef857 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -278,11 +278,12 @@ UNPRIV_HELPERS := $(OUTPUT)/unpriv_helpers.o TRACE_HELPERS := $(OUTPUT)/trace_helpers.o JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o +NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(NETWORK_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) @@ -443,7 +444,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ - test_ringbuf_map_key.c + test_ringbuf_n.c test_ringbuf_map_key.c # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ @@ -646,7 +647,7 @@ $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) # Define test_progs-cpuv4 test runner. ifneq ($(CLANG_CPUV4),) TRUNNER_BPF_BUILD_RULE := CLANG_CPUV4_BPF_BUILD_RULE -TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS $(eval $(call DEFINE_TEST_RUNNER,test_progs,cpuv4)) endif @@ -683,7 +684,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Include find_bit.c to compile xskxceiver. 
EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c -$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) +$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ @@ -717,6 +718,7 @@ $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tas $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.skel.h $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h +$(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -736,6 +738,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_hashmap_lookup.o \ $(OUTPUT)/bench_local_storage_create.o \ $(OUTPUT)/bench_htab_mem.o \ + $(OUTPUT)/bench_bpf_crypto.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ @@ -747,7 +750,7 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index b2b4c391eb0a..627b74ae041b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -280,6 +280,8 @@ extern struct argp bench_strncmp_argp; extern struct argp bench_hashmap_lookup_argp; extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; +extern struct argp bench_trigger_batch_argp; +extern struct argp bench_crypto_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -292,6 +294,8 @@ static const struct argp_child bench_parsers[] = { { &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 }, { &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 }, { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, + { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, + { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, {}, }; @@ -491,24 +495,31 @@ extern const struct bench bench_rename_kretprobe; extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; -extern const struct bench bench_trig_base; -extern const struct bench bench_trig_tp; -extern const struct bench bench_trig_rawtp; + +/* pure counting benchmarks to establish theoretical limits */ +extern const struct bench bench_trig_usermode_count; +extern const struct bench bench_trig_syscall_count; +extern const struct bench bench_trig_kernel_count; + +/* batched, staying mostly in-kernel benchmarks */ extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_kretprobe; extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; -extern const struct bench bench_trig_fentry_sleep; extern const struct bench 
bench_trig_fmodret; -extern const struct bench bench_trig_uprobe_base; +extern const struct bench bench_trig_tp; +extern const struct bench bench_trig_rawtp; + +/* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; extern const struct bench bench_trig_uretprobe_nop; extern const struct bench bench_trig_uprobe_push; extern const struct bench bench_trig_uretprobe_push; extern const struct bench bench_trig_uprobe_ret; extern const struct bench bench_trig_uretprobe_ret; + extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; extern const struct bench bench_pb_libbpf; @@ -529,6 +540,8 @@ extern const struct bench bench_local_storage_tasks_trace; extern const struct bench bench_bpf_hashmap_lookup; extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; +extern const struct bench bench_crypto_encrypt; +extern const struct bench bench_crypto_decrypt; static const struct bench *benchs[] = { &bench_count_global, @@ -539,24 +552,28 @@ static const struct bench *benchs[] = { &bench_rename_rawtp, &bench_rename_fentry, &bench_rename_fexit, - &bench_trig_base, - &bench_trig_tp, - &bench_trig_rawtp, + /* pure counting benchmarks for establishing theoretical limits */ + &bench_trig_usermode_count, + &bench_trig_kernel_count, + &bench_trig_syscall_count, + /* batched, staying mostly in-kernel triggers */ &bench_trig_kprobe, &bench_trig_kretprobe, &bench_trig_kprobe_multi, &bench_trig_kretprobe_multi, &bench_trig_fentry, &bench_trig_fexit, - &bench_trig_fentry_sleep, &bench_trig_fmodret, - &bench_trig_uprobe_base, + &bench_trig_tp, + &bench_trig_rawtp, + /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, &bench_trig_uprobe_push, &bench_trig_uretprobe_push, &bench_trig_uprobe_ret, &bench_trig_uretprobe_ret, + /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, &bench_pb_libbpf, @@ -577,6 +594,8 @@ static const struct bench *benchs[] = { &bench_bpf_hashmap_lookup, &bench_local_storage_create, &bench_htab_mem, + &bench_crypto_encrypt, + &bench_crypto_decrypt, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c new file mode 100644 index 000000000000..2845edaba8db --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include <argp.h> +#include "bench.h" +#include "crypto_bench.skel.h" + +#define MAX_CIPHER_LEN 32 +static char *input; +static struct crypto_ctx { + struct crypto_bench *skel; + int pfd; +} ctx; + +static struct crypto_args { + u32 crypto_len; + char *crypto_cipher; +} args = { + .crypto_len = 16, + .crypto_cipher = "ecb(aes)", +}; + +enum { + ARG_CRYPTO_LEN = 5000, + ARG_CRYPTO_CIPHER = 5001, +}; + +static const struct argp_option opts[] = { + { "crypto-len", ARG_CRYPTO_LEN, "CRYPTO_LEN", 0, + "Set the length of crypto buffer" }, + { "crypto-cipher", ARG_CRYPTO_CIPHER, "CRYPTO_CIPHER", 0, + "Set the cipher to use (default:ecb(aes))" }, + {}, +}; + +static error_t crypto_parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_CRYPTO_LEN: + args.crypto_len = strtoul(arg, NULL, 10); + if (!args.crypto_len || + args.crypto_len > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "Invalid crypto buffer len (limit %zu)\n", + sizeof(ctx.skel->bss->dst)); + argp_usage(state); + } + break; + case ARG_CRYPTO_CIPHER: + args.crypto_cipher = strdup(arg); + if (!strlen(args.crypto_cipher) || + strlen(args.crypto_cipher) > MAX_CIPHER_LEN) { + fprintf(stderr, "Invalid crypto cipher len (limit %d)\n", + MAX_CIPHER_LEN); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_crypto_argp = { + .options = opts, + .parser = crypto_parse_arg, +}; + +static void crypto_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "bpf crypto benchmark doesn't support consumer!\n"); + exit(1); + } +} + +static void crypto_setup(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + + int err, pfd; + size_t i, sz; + + sz = args.crypto_len; + if (!sz || sz > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "invalid encrypt buffer size (source %zu, target %zu)\n", + sz, sizeof(ctx.skel->bss->dst)); + exit(1); + } + + setup_libbpf(); + + ctx.skel = crypto_bench__open(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + snprintf(ctx.skel->bss->cipher, 128, "%s", args.crypto_cipher); + memcpy(ctx.skel->bss->key, "12345678testtest", 16); + ctx.skel->bss->key_len = 16; + ctx.skel->bss->authsize = 0; + + srandom(time(NULL)); + input = malloc(sz); + for (i = 0; i < sz - 1; i++) + input[i] = '1' + random() % 9; + input[sz - 1] = '\0'; + + ctx.skel->rodata->len = args.crypto_len; + + err = crypto_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + pfd = bpf_program__fd(ctx.skel->progs.crypto_setup); + if (pfd < 0) { + fprintf(stderr, "failed to get fd for setup prog\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + err = bpf_prog_test_run_opts(pfd, &opts); + if (err || ctx.skel->bss->status) { + fprintf(stderr, "failed to run setup prog: err %d, status %d\n", + err, ctx.skel->bss->status); + crypto_bench__destroy(ctx.skel); + exit(1); + } +} + +static void crypto_encrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_encrypt); +} + +static void crypto_decrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_decrypt); +} + +static void crypto_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); +} + +static void *crypto_producer(void *unused) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .repeat = 64, + .data_in = input, + .data_size_in = args.crypto_len, + ); + + while (true) + 
(void)bpf_prog_test_run_opts(ctx.pfd, &opts); + return NULL; +} + +const struct bench bench_crypto_encrypt = { + .name = "crypto-encrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_encrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_crypto_decrypt = { + .name = "crypto-decrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_decrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index d66eddacd642..4b05539f167d 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -1,11 +1,57 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ #define _GNU_SOURCE +#include <argp.h> #include <unistd.h> +#include <stdint.h> #include "bench.h" #include "trigger_bench.skel.h" #include "trace_helpers.h" +#define MAX_TRIG_BATCH_ITERS 1000 + +static struct { + __u32 batch_iters; +} args = { + .batch_iters = 100, +}; + +enum { + ARG_TRIG_BATCH_ITERS = 7000, +}; + +static const struct argp_option opts[] = { + { "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0, + "Number of in-kernel iterations per one driver test run"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_TRIG_BATCH_ITERS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > MAX_TRIG_BATCH_ITERS) { + fprintf(stderr, "invalid --trig-batch-iters value (should be between %d and %d)\n", + 1, MAX_TRIG_BATCH_ITERS); + argp_usage(state); + } + args.batch_iters = ret; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_trigger_batch_argp = { + .options = opts, + .parser = parse_arg, +}; + /* adjust slot shift in inc_hits() if changing */ #define MAX_BUCKETS 256 @@ -14,6 +60,8 @@ /* BPF triggering benchmarks */ static struct trigger_ctx { struct trigger_bench *skel; + bool usermode_counters; + int driver_prog_fd; } ctx; static struct counter base_hits[MAX_BUCKETS]; @@ -51,41 +99,63 @@ static void trigger_validate(void) } } -static void *trigger_base_producer(void *input) +static void *trigger_producer(void *input) { - while (true) { - (void)syscall(__NR_getpgid); - inc_counter(base_hits); + if (ctx.usermode_counters) { + while (true) { + (void)syscall(__NR_getpgid); + inc_counter(base_hits); + } + } else { + while (true) + (void)syscall(__NR_getpgid); } return NULL; } -static void trigger_base_measure(struct bench_res *res) +static void *trigger_producer_batch(void *input) { - res->hits = sum_and_reset_counters(base_hits); -} + int fd = ctx.driver_prog_fd ?: bpf_program__fd(ctx.skel->progs.trigger_driver); -static void *trigger_producer(void *input) -{ while (true) - (void)syscall(__NR_getpgid); + bpf_prog_test_run_opts(fd, NULL); + return NULL; } static void trigger_measure(struct bench_res *res) { - res->hits = sum_and_reset_counters(ctx.skel->bss->hits); + if (ctx.usermode_counters) + res->hits = sum_and_reset_counters(base_hits); + else + res->hits = sum_and_reset_counters(ctx.skel->bss->hits); } static void setup_ctx(void) { setup_libbpf(); - ctx.skel = 
trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + + /* default "driver" BPF program */ + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); + + ctx.skel->rodata->batch_iters = args.batch_iters; +} + +static void load_ctx(void) +{ + int err; + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } } static void attach_bpf(struct bpf_program *prog) @@ -99,64 +169,104 @@ static void attach_bpf(struct bpf_program *prog) } } -static void trigger_tp_setup(void) +static void trigger_syscall_count_setup(void) { - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_tp); + ctx.usermode_counters = true; } -static void trigger_rawtp_setup(void) +/* Batched, staying mostly in-kernel triggering setups */ +static void trigger_kernel_count_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_count, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); } static void trigger_kprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe); } static void trigger_kretprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe); } static void trigger_kprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi); } static void trigger_kretprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi); } static void trigger_fentry_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fentry); } static void trigger_fexit_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fexit); } -static void trigger_fentry_sleep_setup(void) +static void trigger_fmodret_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); } -static void trigger_fmodret_setup(void) +static void trigger_tp_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fmodret); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_tp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_tp); +} + +static void 
trigger_rawtp_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_rawtp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_rawtp); } /* make sure call is not inlined and not avoided by compiler, so __weak and @@ -192,7 +302,7 @@ __nocf_check __weak void uprobe_target_ret(void) asm volatile (""); } -static void *uprobe_base_producer(void *input) +static void *uprobe_producer_count(void *input) { while (true) { uprobe_target_nop(); @@ -226,15 +336,24 @@ static void usetup(bool use_retprobe, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; + int err; setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + uprobe_offset = get_uprobe_offset(target_addr); link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, use_retprobe, @@ -248,204 +367,90 @@ static void usetup(bool use_retprobe, void *target_addr) ctx.skel->links.bench_trigger_uprobe = link; } -static void uprobe_setup_nop(void) +static void usermode_count_setup(void) +{ + ctx.usermode_counters = true; +} + +static void uprobe_nop_setup(void) { usetup(false, &uprobe_target_nop); } -static void uretprobe_setup_nop(void) +static void uretprobe_nop_setup(void) { usetup(true, &uprobe_target_nop); } -static void uprobe_setup_push(void) +static void uprobe_push_setup(void) { usetup(false, &uprobe_target_push); } -static void uretprobe_setup_push(void) +static void uretprobe_push_setup(void) { usetup(true, &uprobe_target_push); } -static void uprobe_setup_ret(void) +static void uprobe_ret_setup(void) { usetup(false, &uprobe_target_ret); } -static void uretprobe_setup_ret(void) +static void uretprobe_ret_setup(void) { usetup(true, &uprobe_target_ret); } -const struct bench bench_trig_base = { - .name = "trig-base", - .validate = trigger_validate, - .producer_thread = trigger_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_tp = { - .name = "trig-tp", - .validate = trigger_validate, - .setup = trigger_tp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_rawtp = { - .name = "trig-rawtp", - .validate = trigger_validate, - .setup = trigger_rawtp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe = { - .name = "trig-kprobe", - .validate = trigger_validate, - .setup = trigger_kprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe = { - .name = "trig-kretprobe", - .validate = trigger_validate, - .setup = 
trigger_kretprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe_multi = { - .name = "trig-kprobe-multi", - .validate = trigger_validate, - .setup = trigger_kprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe_multi = { - .name = "trig-kretprobe-multi", - .validate = trigger_validate, - .setup = trigger_kretprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry = { - .name = "trig-fentry", - .validate = trigger_validate, - .setup = trigger_fentry_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fexit = { - .name = "trig-fexit", - .validate = trigger_validate, - .setup = trigger_fexit_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry_sleep = { - .name = "trig-fentry-sleep", - .validate = trigger_validate, - .setup = trigger_fentry_sleep_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fmodret = { - .name = "trig-fmodret", +const struct bench bench_trig_syscall_count = { + .name = "trig-syscall-count", .validate = trigger_validate, - .setup = trigger_fmodret_setup, + .setup = trigger_syscall_count_setup, .producer_thread = trigger_producer, .measure = trigger_measure, .report_progress = hits_drops_report_progress, .report_final = hits_drops_report_final, }; -const struct bench bench_trig_uprobe_base = { - .name = "trig-uprobe-base", - .setup = NULL, /* no uprobe/uretprobe is attached */ - .producer_thread = uprobe_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_nop = { - .name = "trig-uprobe-nop", - .setup = uprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_nop = { - .name = "trig-uretprobe-nop", - .setup = uretprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_push = { - .name = "trig-uprobe-push", - .setup = uprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_push = { - .name = "trig-uretprobe-push", - .setup = uretprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final 
= hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_ret = { - .name = "trig-uprobe-ret", - .setup = uprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_ret = { - .name = "trig-uretprobe-ret", - .setup = uretprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; +/* batched (staying mostly in kernel) kprobe/fentry benchmarks */ +#define BENCH_TRIG_KERNEL(KIND, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .setup = trigger_##KIND##_setup, \ + .producer_thread = trigger_producer_batch, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ + .argp = &bench_trigger_batch_argp, \ +} + +BENCH_TRIG_KERNEL(kernel_count, "kernel-count"); +BENCH_TRIG_KERNEL(kprobe, "kprobe"); +BENCH_TRIG_KERNEL(kretprobe, "kretprobe"); +BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); +BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); +BENCH_TRIG_KERNEL(fentry, "fentry"); +BENCH_TRIG_KERNEL(fexit, "fexit"); +BENCH_TRIG_KERNEL(fmodret, "fmodret"); +BENCH_TRIG_KERNEL(tp, "tp"); +BENCH_TRIG_KERNEL(rawtp, "rawtp"); + +/* uprobe benchmarks */ +#define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .validate = trigger_validate, \ + .setup = KIND##_setup, \ + .producer_thread = uprobe_producer_##PRODUCER, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ +} + +BENCH_TRIG_USERMODE(usermode_count, count, "usermode-count"); +BENCH_TRIG_USERMODE(uprobe_nop, nop, "uprobe-nop"); +BENCH_TRIG_USERMODE(uprobe_push, push, "uprobe-push"); +BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); +BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); +BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); +BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 78e83f243294..a690f5a68b6b 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -2,8 +2,22 @@ set -eufo pipefail -for i in base tp rawtp kprobe fentry fmodret -do - summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) - printf "%-10s: %s\n" $i "$summary" +def_tests=( \ + usermode-count kernel-count syscall-count \ + fentry fexit fmodret \ + rawtp tp \ + kprobe kprobe-multi \ + kretprobe kretprobe-multi \ +) + +tests=("$@") +if [ ${#tests[@]} -eq 0 ]; then + tests=("${def_tests[@]}") +fi + +p=${PROD_CNT:-1} + +for t in "${tests[@]}"; do + summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-15s: %s\n" $t "$summary" done diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index 9bdcc74e03a4..af169f831f2f 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in base {uprobe,uretprobe}-{nop,push,ret} +for i in 
usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index a5b9df38c162..8b9cc87be4c4 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -326,6 +326,16 @@ l_true: \ }) #endif +#ifdef __BPF_FEATURE_MAY_GOTO +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else #define cond_break \ ({ __label__ l_break, l_continue; \ asm volatile goto("1:.byte 0xe5; \ @@ -337,6 +347,7 @@ l_true: \ l_break: break; \ l_continue:; \ }) +#endif #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ @@ -386,6 +397,28 @@ l_true: \ , [as]"i"((dst_as << 16) | src_as)); #endif +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; + +typedef struct { +} __bpf_preempt_t; + +static inline __bpf_preempt_t __bpf_preempt_constructor(void) +{ + __bpf_preempt_t ret = {}; + + bpf_preempt_disable(); + return ret; +} +static inline void __bpf_preempt_destructor(__bpf_preempt_t *t) +{ + bpf_preempt_enable(); +} +#define bpf_guard_preempt() \ + __bpf_preempt_t ___bpf_apply(preempt, __COUNTER__) \ + __attribute__((__unused__, __cleanup__(__bpf_preempt_destructor))) = \ + __bpf_preempt_constructor() + /* Description * Assert that a conditional expression is true. * Returns @@ -459,4 +492,11 @@ extern int bpf_iter_css_new(struct bpf_iter_css *it, extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; +extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; +extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; +extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags__k, void *aux__ign) __ksym; +#define bpf_wq_set_callback(timer, cb, flags) \ + bpf_wq_set_callback_impl(timer, cb, flags, NULL) #endif diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 39ad96a18123..eb2b78552ca2 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -494,6 +494,10 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused return arg; } +__bpf_kfunc void bpf_kfunc_call_test_sleepable(void) +{ +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -520,6 +524,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index 7c664dd61059..ce5cd763561c 100644 --- 
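The bpf_guard_preempt() macro introduced above pairs bpf_preempt_disable() with a __cleanup__-driven bpf_preempt_enable(), so the critical section ends automatically when the guard variable goes out of scope. A minimal sketch of how a BPF program might use it (program name and attach point are illustrative, not taken from this series):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_experimental.h"

char _license[] SEC("license") = "GPL";

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(guarded_section)
{
	bpf_guard_preempt();	/* bpf_preempt_disable() now, bpf_preempt_enable() at scope exit */

	/* preemption-sensitive work goes here */
	return 0;
}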
a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -96,6 +96,7 @@ void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; void bpf_kfunc_call_test_destructive(void) __ksym; +void bpf_kfunc_call_test_sleepable(void) __ksym; void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p); struct prog_test_member *bpf_kfunc_call_memb_acquire(void); diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 19be9c63d5e8..f2952a65dcc2 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -429,7 +429,7 @@ int create_and_get_cgroup(const char *relative_path) * which is an invalid cgroup id. * If there is a failure, it prints the error to stderr. */ -unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) +static unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) { int dirfd, err, flags, mount_id, fhsize; union { diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 01f241ea2c67..eeabd798bc3a 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -13,7 +13,12 @@ CONFIG_BPF_SYSCALL=y CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_USER_API=y CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_USER_API_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_AES=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF4=y @@ -88,3 +93,5 @@ CONFIG_VSOCKETS=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y CONFIG_XFRM_INTERFACE=y +CONFIG_TCP_CONG_DCTCP=y +CONFIG_TCP_CONG_BBR=y diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 6db27a9088e9..a9e4905a2115 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -52,6 +52,8 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; +static const struct network_helper_opts default_opts; + int settimeo(int fd, int timeout_ms) { struct timeval timeout = { .tv_sec = 3 }; @@ -185,6 +187,16 @@ close_fds: return NULL; } +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts) +{ + if (!opts) + opts = &default_opts; + + return __start_server(type, 0, (struct sockaddr *)addr, len, + opts->timeout_ms, 0); +} + void free_fds(int *fds, unsigned int nr_close_fds) { if (fds) { @@ -258,17 +270,24 @@ static int connect_fd_to_addr(int fd, return 0; } -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t addrlen, int type) +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { int fd; - fd = socket(addr->ss_family, type, 0); + if (!opts) + opts = &default_opts; + + fd = socket(addr->ss_family, type, opts->proto); if (fd < 0) { log_err("Failed to create client socket"); return -1; } - if (connect_fd_to_addr(fd, addr, addrlen, false)) + if (settimeo(fd, opts->timeout_ms)) + goto error_close; + + if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail)) goto error_close; return fd; @@ -278,8 +297,6 @@ error_close: return -1; } -static const struct network_helper_opts default_opts; - int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; @@ 
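A hedged sketch of how the reworked network helpers compose in a prog_tests-style caller (address and timeout are illustrative): start_server_addr() now takes the socket type first plus an optional struct network_helper_opts, and connect_to_addr() honours opts->proto and opts->timeout_ms:

static void example_client_server(void)
{
	struct network_helper_opts opts = { .timeout_ms = 3000 };
	struct sockaddr_storage addr;
	socklen_t len;
	int srv, cli;

	if (make_sockaddr(AF_INET, "127.0.0.1", 0, &addr, &len))
		return;
	srv = start_server_addr(SOCK_STREAM, &addr, len, &opts);	/* socket + bind + listen */
	if (srv < 0)
		return;
	len = sizeof(addr);
	if (getsockname(srv, (struct sockaddr *)&addr, &len))		/* pick up the ephemeral port */
		goto out;
	cli = connect_to_addr(SOCK_STREAM, &addr, len, &opts);		/* socket + settimeo + connect */
	if (cli >= 0)
		close(cli);
out:
	close(srv);
}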
-442,25 +459,35 @@ struct nstoken *open_netns(const char *name) struct nstoken *token; token = calloc(1, sizeof(struct nstoken)); - if (!ASSERT_OK_PTR(token, "malloc token")) + if (!token) { + log_err("Failed to malloc token"); return NULL; + } token->orig_netns_fd = open("/proc/self/ns/net", O_RDONLY); - if (!ASSERT_GE(token->orig_netns_fd, 0, "open /proc/self/ns/net")) + if (token->orig_netns_fd == -1) { + log_err("Failed to open(/proc/self/ns/net)"); goto fail; + } snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); nsfd = open(nspath, O_RDONLY | O_CLOEXEC); - if (!ASSERT_GE(nsfd, 0, "open netns fd")) + if (nsfd == -1) { + log_err("Failed to open(%s)", nspath); goto fail; + } err = setns(nsfd, CLONE_NEWNET); close(nsfd); - if (!ASSERT_OK(err, "setns")) + if (err) { + log_err("Failed to setns(nsfd)"); goto fail; + } return token; fail: + if (token->orig_netns_fd != -1) + close(token->orig_netns_fd); free(token); return NULL; } @@ -470,7 +497,8 @@ void close_netns(struct nstoken *token) if (!token) return; - ASSERT_OK(setns(token->orig_netns_fd, CLONE_NEWNET), "setns"); + if (setns(token->orig_netns_fd, CLONE_NEWNET)) + log_err("Failed to setns(orig_netns_fd)"); close(token->orig_netns_fd); free(token); } @@ -497,3 +525,153 @@ int get_socket_local_port(int sock_fd) return -1; } + +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_GRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} + +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_SRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} + +struct send_recv_arg { + int fd; + uint32_t bytes; + int stop; +}; + +static void *send_recv_server(void *arg) +{ + struct send_recv_arg *a = (struct send_recv_arg *)arg; + ssize_t nr_sent = 0, bytes = 0; + char batch[1500]; + int err = 0, fd; + + fd = accept(a->fd, NULL, NULL); + while (fd == -1) { + if (errno == EINTR) + continue; + err = -errno; + goto done; + } + + if (settimeo(fd, 0)) { + err = -errno; + goto done; + } + + while (bytes < a->bytes && !READ_ONCE(a->stop)) { + nr_sent = send(fd, &batch, + MIN(a->bytes - bytes, sizeof(batch)), 0); + if (nr_sent == -1 && errno == EINTR) + continue; + if (nr_sent == -1) { + err = -errno; + break; + } + bytes += nr_sent; + } + + if (bytes != a->bytes) { + log_err("send %zd expected %u", bytes, a->bytes); + if (!err) + err = bytes > a->bytes ? 
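The two ethtool helpers above return 0 on success or a negative errno. A small usage sketch (the interface name is illustrative) that grows the RX ring to its advertised maximum:

static int grow_rx_ring(char *ifname)
{
	struct ethtool_ringparam ring = {};
	int err;

	err = get_hw_ring_size(ifname, &ring);	/* ETHTOOL_GRINGPARAM */
	if (err)
		return err;
	ring.rx_pending = ring.rx_max_pending;
	return set_hw_ring_size(ifname, &ring);	/* ETHTOOL_SRINGPARAM */
}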
-E2BIG : -EINTR; + } + +done: + if (fd >= 0) + close(fd); + if (err) { + WRITE_ONCE(a->stop, 1); + return ERR_PTR(err); + } + return NULL; +} + +int send_recv_data(int lfd, int fd, uint32_t total_bytes) +{ + ssize_t nr_recv = 0, bytes = 0; + struct send_recv_arg arg = { + .fd = lfd, + .bytes = total_bytes, + .stop = 0, + }; + pthread_t srv_thread; + void *thread_ret; + char batch[1500]; + int err = 0; + + err = pthread_create(&srv_thread, NULL, send_recv_server, (void *)&arg); + if (err) { + log_err("Failed to pthread_create"); + return err; + } + + /* recv total_bytes */ + while (bytes < total_bytes && !READ_ONCE(arg.stop)) { + nr_recv = recv(fd, &batch, + MIN(total_bytes - bytes, sizeof(batch)), 0); + if (nr_recv == -1 && errno == EINTR) + continue; + if (nr_recv == -1) { + err = -errno; + break; + } + bytes += nr_recv; + } + + if (bytes != total_bytes) { + log_err("recv %zd expected %u", bytes, total_bytes); + if (!err) + err = bytes > total_bytes ? -E2BIG : -EINTR; + } + + WRITE_ONCE(arg.stop, 1); + pthread_join(srv_thread, &thread_ret); + if (IS_ERR(thread_ret)) { + log_err("Failed in thread_ret %ld", PTR_ERR(thread_ret)); + err = err ? : PTR_ERR(thread_ret); + } + + return err; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 94b9be24e39b..5a8c5cf4ec1a 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -9,8 +9,12 @@ typedef __u16 __sum16; #include <linux/if_packet.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> +#include <linux/err.h> #include <netinet/tcp.h> #include <bpf/bpf_endian.h> +#include <net/if.h> #define MAGIC_VAL 0x1234 #define NUM_ITER 100000 @@ -50,8 +54,11 @@ int start_mptcp_server(int family, const char *addr, __u16 port, int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens); +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); void free_fds(int *fds, unsigned int nr_close_fds); -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t len, int type); +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); @@ -61,6 +68,8 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, struct sockaddr_storage *addr, socklen_t *len); char *ping_command(int family); int get_socket_local_port(int sock_fd); +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); struct nstoken; /** @@ -71,6 +80,7 @@ struct nstoken; */ struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); +int send_recv_data(int lfd, int fd, uint32_t total_bytes); static __u16 csum_fold(__u32 csum) { diff --git a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c new file mode 100644 index 000000000000..0807a48a58ee --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
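send_recv_data() above takes a listening socket and a connected socket, pushes total_bytes from a helper thread and receives them on the caller side; the bpf_tcp_ca.c hunk later in this patch uses it exactly that way. A minimal caller sketch for inside a test function (the byte count is illustrative):

	int lfd, fd;

	lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0);
	if (!ASSERT_NEQ(lfd, -1, "start_server"))
		return;
	fd = connect_to_fd(lfd, 0);
	if (ASSERT_NEQ(fd, -1, "connect_to_fd")) {
		ASSERT_OK(send_recv_data(lfd, fd, 10 * 1024 * 1024), "send_recv_data");
		close(fd);
	}
	close(lfd);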
*/ +#include <test_progs.h> +#include "arena_atomics.skel.h" + +static void test_add(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->add64_value, 3, "add64_value"); + ASSERT_EQ(skel->arena->add64_result, 1, "add64_result"); + + ASSERT_EQ(skel->arena->add32_value, 3, "add32_value"); + ASSERT_EQ(skel->arena->add32_result, 1, "add32_result"); + + ASSERT_EQ(skel->arena->add_stack_value_copy, 3, "add_stack_value"); + ASSERT_EQ(skel->arena->add_stack_result, 1, "add_stack_result"); + + ASSERT_EQ(skel->arena->add_noreturn_value, 3, "add_noreturn_value"); +} + +static void test_sub(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.sub); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->sub64_value, -1, "sub64_value"); + ASSERT_EQ(skel->arena->sub64_result, 1, "sub64_result"); + + ASSERT_EQ(skel->arena->sub32_value, -1, "sub32_value"); + ASSERT_EQ(skel->arena->sub32_result, 1, "sub32_result"); + + ASSERT_EQ(skel->arena->sub_stack_value_copy, -1, "sub_stack_value"); + ASSERT_EQ(skel->arena->sub_stack_result, 1, "sub_stack_result"); + + ASSERT_EQ(skel->arena->sub_noreturn_value, -1, "sub_noreturn_value"); +} + +static void test_and(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.and); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->and64_value, 0x010ull << 32, "and64_value"); + ASSERT_EQ(skel->arena->and32_value, 0x010, "and32_value"); +} + +static void test_or(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.or); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->or64_value, 0x111ull << 32, "or64_value"); + ASSERT_EQ(skel->arena->or32_value, 0x111, "or32_value"); +} + +static void test_xor(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xor); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xor64_value, 0x101ull << 32, "xor64_value"); + ASSERT_EQ(skel->arena->xor32_value, 0x101, "xor32_value"); +} + +static void test_cmpxchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.cmpxchg); + err = bpf_prog_test_run_opts(prog_fd, 
&topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->cmpxchg64_value, 2, "cmpxchg64_value"); + ASSERT_EQ(skel->arena->cmpxchg64_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg64_result_succeed, 1, "cmpxchg_result_succeed"); + + ASSERT_EQ(skel->arena->cmpxchg32_value, 2, "lcmpxchg32_value"); + ASSERT_EQ(skel->arena->cmpxchg32_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg32_result_succeed, 1, "cmpxchg_result_succeed"); +} + +static void test_xchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xchg64_value, 2, "xchg64_value"); + ASSERT_EQ(skel->arena->xchg64_result, 1, "xchg64_result"); + + ASSERT_EQ(skel->arena->xchg32_value, 2, "xchg32_value"); + ASSERT_EQ(skel->arena->xchg32_result, 1, "xchg32_result"); +} + +void test_arena_atomics(void) +{ + struct arena_atomics *skel; + int err; + + skel = arena_atomics__open(); + if (!ASSERT_OK_PTR(skel, "arena atomics skeleton open")) + return; + + if (skel->data->skip_tests) { + printf("%s:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang", + __func__); + test__skip(); + goto cleanup; + } + err = arena_atomics__load(skel); + if (!ASSERT_OK(err, "arena atomics skeleton load")) + return; + skel->bss->pid = getpid(); + + if (test__start_subtest("add")) + test_add(skel); + if (test__start_subtest("sub")) + test_sub(skel); + if (test__start_subtest("and")) + test_and(skel); + if (test__start_subtest("or")) + test_or(skel); + if (test__start_subtest("xor")) + test_xor(skel); + if (test__start_subtest("cmpxchg")) + test_cmpxchg(skel); + if (test__start_subtest("xchg")) + test_xchg(skel); + +cleanup: + arena_atomics__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 94cb22b01482..907bac46c774 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -13,6 +13,7 @@ #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" #include "tcp_ca_unsupp_cong_op.skel.h" +#include "tcp_ca_kfunc.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -20,7 +21,6 @@ static const unsigned int total_bytes = 10 * 1024 * 1024; static int expected_stg = 0xeB9F; -static int stop; static int settcpca(int fd, const char *tcp_ca) { @@ -33,62 +33,11 @@ static int settcpca(int fd, const char *tcp_ca) return 0; } -static void *server(void *arg) -{ - int lfd = (int)(long)arg, err = 0, fd; - ssize_t nr_sent = 0, bytes = 0; - char batch[1500]; - - fd = accept(lfd, NULL, NULL); - while (fd == -1) { - if (errno == EINTR) - continue; - err = -errno; - goto done; - } - - if (settimeo(fd, 0)) { - err = -errno; - goto done; - } - - while (bytes < total_bytes && !READ_ONCE(stop)) { - nr_sent = send(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); - if (nr_sent == -1 && errno == EINTR) - continue; - if (nr_sent == -1) { - err = -errno; - break; - } - bytes += nr_sent; - } - - ASSERT_EQ(bytes, total_bytes, "send"); - -done: - if (fd >= 0) - close(fd); - if (err) { - WRITE_ONCE(stop, 1); - return 
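The BPF counterpart (progs/arena_atomics.c) is not part of this hunk. Judging from the assertions above (add64_value == 3, add64_result == 1), the 'add' program presumably starts from an arena-resident value of 1 and fetch-adds 2; a sketch of that shape, using the selftests' arena map and __arena_global conventions (not the actual selftest source):

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 10);
} arena SEC(".maps");

__u64 __arena_global add64_value = 1;
__u64 __arena_global add64_result;

SEC("syscall")
int add(void *ctx)
{
	add64_result = __sync_fetch_and_add(&add64_value, 2);	/* old value 1 is stored, value becomes 3 */
	return 0;
}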
ERR_PTR(err); - } - return NULL; -} - static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { - struct sockaddr_in6 sa6 = {}; - ssize_t nr_recv = 0, bytes = 0; int lfd = -1, fd = -1; - pthread_t srv_thread; - socklen_t addrlen = sizeof(sa6); - void *thread_ret; - char batch[1500]; int err; - WRITE_ONCE(stop, 0); - lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; @@ -99,12 +48,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) return; } - if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || - settimeo(lfd, 0) || settimeo(fd, 0)) - goto done; - - err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen); - if (!ASSERT_NEQ(err, -1, "getsockname")) + if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca)) goto done; if (sk_stg_map) { @@ -115,7 +59,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) } /* connect to server */ - err = connect(fd, (struct sockaddr *)&sa6, addrlen); + err = connect_fd_to_fd(fd, lfd, 0); if (!ASSERT_NEQ(err, -1, "connect")) goto done; @@ -129,26 +73,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) goto done; } - err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); - if (!ASSERT_OK(err, "pthread_create")) - goto done; - - /* recv total_bytes */ - while (bytes < total_bytes && !READ_ONCE(stop)) { - nr_recv = recv(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); - if (nr_recv == -1 && errno == EINTR) - continue; - if (nr_recv == -1) - break; - bytes += nr_recv; - } - - ASSERT_EQ(bytes, total_bytes, "recv"); - - WRITE_ONCE(stop, 1); - pthread_join(srv_thread, &thread_ret); - ASSERT_OK(IS_ERR(thread_ret), "thread_ret"); + ASSERT_OK(send_recv_data(lfd, fd, total_bytes), "send_recv_data"); done: close(lfd); @@ -304,7 +229,7 @@ static void test_rel_setsockopt(void) struct bpf_dctcp_release *rel_skel; libbpf_print_fn_t old_print_fn; - err_str = "unknown func bpf_setsockopt"; + err_str = "program of this type cannot use helper bpf_setsockopt"; found = false; old_print_fn = libbpf_set_print(libbpf_debug_print); @@ -518,6 +443,15 @@ static void test_link_replace(void) tcp_ca_update__destroy(skel); } +static void test_tcp_ca_kfunc(void) +{ + struct tcp_ca_kfunc *skel; + + skel = tcp_ca_kfunc__open_and_load(); + ASSERT_OK_PTR(skel, "tcp_ca_kfunc__open_and_load"); + tcp_ca_kfunc__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -546,4 +480,6 @@ void test_bpf_tcp_ca(void) test_multi_links(); if (test__start_subtest("link_replace")) test_link_replace(); + if (test__start_subtest("tcp_ca_kfunc")) + test_tcp_ca_kfunc(); } diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 2a55f717fc07..34b59f6baca1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -10,6 +10,7 @@ #include <netinet/tcp.h> #include <test_progs.h> +#include "network_helpers.h" #include "progs/test_cls_redirect.h" #include "test_cls_redirect.skel.h" @@ -35,39 +36,6 @@ struct tuple { struct addr_port dst; }; -static int start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto err; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - 
-static int connect_to_server(const struct sockaddr *addr, socklen_t len, - int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(connect(fd, addr, len))) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) { const struct sockaddr_in6 *in6; @@ -98,14 +66,14 @@ static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, socklen_t slen = sizeof(ss); struct sockaddr *sa = (struct sockaddr *)&ss; - *server = start_server(addr, len, type); + *server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (*server < 0) return false; if (CHECK_FAIL(getsockname(*server, sa, &slen))) goto close_server; - *conn = connect_to_server(sa, slen, type); + *conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL); if (*conn < 0) goto close_server; diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c new file mode 100644 index 000000000000..b1a3a49a822a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <net/if.h> +#include <linux/in6.h> +#include <linux/if_alg.h> + +#include "test_progs.h" +#include "network_helpers.h" +#include "crypto_sanity.skel.h" +#include "crypto_basic.skel.h" + +#define NS_TEST "crypto_sanity_ns" +#define IPV6_IFACE_ADDR "face::1" +static const unsigned char crypto_key[] = "testtest12345678"; +static const char plain_text[] = "stringtoencrypt0"; +static int opfd = -1, tfmfd = -1; +static const char algo[] = "ecb(aes)"; +static int init_afalg(void) +{ + struct sockaddr_alg sa = { + .salg_family = AF_ALG, + .salg_type = "skcipher", + .salg_name = "ecb(aes)" + }; + + tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (tfmfd == -1) + return errno; + if (bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) == -1) + return errno; + if (setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, crypto_key, 16) == -1) + return errno; + opfd = accept(tfmfd, NULL, 0); + if (opfd == -1) + return errno; + return 0; +} + +static void deinit_afalg(void) +{ + if (tfmfd != -1) + close(tfmfd); + if (opfd != -1) + close(opfd); +} + +static void do_crypt_afalg(const void *src, void *dst, int size, bool encrypt) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char cbuf[CMSG_SPACE(4)] = {0}; + struct iovec iov; + + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_OP; + cmsg->cmsg_len = CMSG_LEN(4); + *(__u32 *)CMSG_DATA(cmsg) = encrypt ? 
ALG_OP_ENCRYPT : ALG_OP_DECRYPT; + + iov.iov_base = (char *)src; + iov.iov_len = size; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + sendmsg(opfd, &msg, 0); + read(opfd, dst, size); +} + +void test_crypto_basic(void) +{ + RUN_TESTS(crypto_basic); +} + +void test_crypto_sanity(void) +{ + LIBBPF_OPTS(bpf_tc_hook, qdisc_hook, .attach_point = BPF_TC_EGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_enc); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_dec); + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct nstoken *nstoken = NULL; + struct crypto_sanity *skel; + char afalg_plain[16] = {0}; + char afalg_dst[16] = {0}; + struct sockaddr_in6 addr; + int sockfd, err, pfd; + socklen_t addrlen; + u16 udp_test_port; + + skel = crypto_sanity__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open")) + return; + + SYS(fail, "ip netns add %s", NS_TEST); + SYS(fail, "ip -net %s -6 addr add %s/128 dev lo nodad", NS_TEST, IPV6_IFACE_ADDR); + SYS(fail, "ip -net %s link set dev lo up", NS_TEST); + + nstoken = open_netns(NS_TEST); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto fail; + + err = init_afalg(); + if (!ASSERT_OK(err, "AF_ALG init fail")) + goto fail; + + qdisc_hook.ifindex = if_nametoindex("lo"); + if (!ASSERT_GT(qdisc_hook.ifindex, 0, "if_nametoindex lo")) + goto fail; + + skel->bss->key_len = 16; + skel->bss->authsize = 0; + udp_test_port = skel->data->udp_test_port; + memcpy(skel->bss->key, crypto_key, sizeof(crypto_key)); + snprintf(skel->bss->algo, 128, "%s", algo); + pfd = bpf_program__fd(skel->progs.skb_crypto_setup); + if (!ASSERT_GT(pfd, 0, "skb_crypto_setup fd")) + goto fail; + + err = bpf_prog_test_run_opts(pfd, &opts); + if (!ASSERT_OK(err, "skb_crypto_setup") || + !ASSERT_OK(opts.retval, "skb_crypto_setup retval")) + goto fail; + + if (!ASSERT_OK(skel->bss->status, "skb_crypto_setup status")) + goto fail; + + err = bpf_tc_hook_create(&qdisc_hook); + if (!ASSERT_OK(err, "create qdisc hook")) + goto fail; + + addrlen = sizeof(addr); + err = make_sockaddr(AF_INET6, IPV6_IFACE_ADDR, udp_test_port, + (void *)&addr, &addrlen); + if (!ASSERT_OK(err, "make_sockaddr")) + goto fail; + + tc_attach_enc.prog_fd = bpf_program__fd(skel->progs.encrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "attach encrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "encrypt socket")) + goto fail; + err = sendto(sockfd, plain_text, sizeof(plain_text), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(plain_text), "encrypt send")) + goto fail; + + do_crypt_afalg(plain_text, afalg_dst, sizeof(afalg_dst), true); + + if (!ASSERT_OK(skel->bss->status, "encrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_dst, sizeof(afalg_dst), "encrypt AF_ALG")) + goto fail; + + tc_attach_enc.flags = tc_attach_enc.prog_fd = tc_attach_enc.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "bpf_tc_detach encrypt")) + goto fail; + + tc_attach_dec.prog_fd = bpf_program__fd(skel->progs.decrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_dec); + if (!ASSERT_OK(err, "attach decrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "decrypt socket")) + goto fail; + err = sendto(sockfd, afalg_dst, sizeof(afalg_dst), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(afalg_dst), "decrypt send")) + goto fail; + + do_crypt_afalg(afalg_dst, afalg_plain, sizeof(afalg_plain), false); + + if 
(!ASSERT_OK(skel->bss->status, "decrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_plain, sizeof(afalg_plain), "decrypt AF_ALG")) + goto fail; + + tc_attach_dec.flags = tc_attach_dec.prog_fd = tc_attach_dec.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_dec); + ASSERT_OK(err, "bpf_tc_detach decrypt"); + +fail: + close_netns(nstoken); + deinit_afalg(); + SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null"); + crypto_sanity__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c index f43fcb13d2c4..d3d94596ab79 100644 --- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c @@ -98,7 +98,8 @@ done: static void test_dummy_multiple_args(void) { - __u64 args[5] = {0, -100, 0x8a5f, 'c', 0x1234567887654321ULL}; + struct bpf_dummy_ops_state st = { 7 }; + __u64 args[5] = {(__u64)&st, -100, 0x8a5f, 'c', 0x1234567887654321ULL}; LIBBPF_OPTS(bpf_test_run_opts, attr, .ctx_in = args, .ctx_size_in = sizeof(args), @@ -115,6 +116,7 @@ static void test_dummy_multiple_args(void) fd = bpf_program__fd(skel->progs.test_2); err = bpf_prog_test_run_opts(fd, &attr); ASSERT_OK(err, "test_run"); + args[0] = 7; for (i = 0; i < ARRAY_SIZE(args); i++) { snprintf(name, sizeof(name), "arg %zu", i); ASSERT_EQ(skel->bss->test_2_args[i], args[i], name); @@ -125,7 +127,8 @@ static void test_dummy_multiple_args(void) static void test_dummy_sleepable(void) { - __u64 args[1] = {0}; + struct bpf_dummy_ops_state st; + __u64 args[1] = {(__u64)&st}; LIBBPF_OPTS(bpf_test_run_opts, attr, .ctx_in = args, .ctx_size_in = sizeof(args), @@ -144,6 +147,31 @@ static void test_dummy_sleepable(void) dummy_st_ops_success__destroy(skel); } +/* dummy_st_ops.test_sleepable() parameter is not marked as nullable, + * thus bpf_prog_test_run_opts() below should be rejected as it tries + * to pass NULL for this parameter. 
+ */ +static void test_dummy_sleepable_reject_null(void) +{ + __u64 args[1] = {0}; + LIBBPF_OPTS(bpf_test_run_opts, attr, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + struct dummy_st_ops_success *skel; + int fd, err; + + skel = dummy_st_ops_success__open_and_load(); + if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load")) + return; + + fd = bpf_program__fd(skel->progs.test_sleepable); + err = bpf_prog_test_run_opts(fd, &attr); + ASSERT_EQ(err, -EINVAL, "test_run"); + + dummy_st_ops_success__destroy(skel); +} + void test_dummy_st_ops(void) { if (test__start_subtest("dummy_st_ops_attach")) @@ -156,6 +184,8 @@ void test_dummy_st_ops(void) test_dummy_multiple_args(); if (test__start_subtest("dummy_sleepable")) test_dummy_sleepable(); + if (test__start_subtest("dummy_sleepable_reject_null")) + test_dummy_sleepable_reject_null(); RUN_TESTS(dummy_st_ops_fail); } diff --git a/tools/testing/selftests/bpf/prog_tests/empty_skb.c b/tools/testing/selftests/bpf/prog_tests/empty_skb.c index 261228eb68e8..438583e1f2d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/empty_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/empty_skb.c @@ -94,6 +94,8 @@ void test_empty_skb(void) SYS(out, "ip netns add empty_skb"); tok = open_netns("empty_skb"); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add veth0 type veth peer veth1"); SYS(out, "ip link set dev veth0 up"); SYS(out, "ip link set dev veth1 up"); diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c index 3379df2d4cf2..bd7658958004 100644 --- a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c @@ -26,6 +26,17 @@ #define IPV6_TBID_ADDR "fd00::FFFF" #define IPV6_TBID_NET "fd00::" #define IPV6_TBID_DST "fd00::2" +#define MARK_NO_POLICY 33 +#define MARK 42 +#define MARK_TABLE "200" +#define IPV4_REMOTE_DST "1.2.3.4" +#define IPV4_LOCAL "10.4.0.3" +#define IPV4_GW1 "10.4.0.1" +#define IPV4_GW2 "10.4.0.2" +#define IPV6_REMOTE_DST "be:ef::b0:10" +#define IPV6_LOCAL "fd01::3" +#define IPV6_GW1 "fd01::1" +#define IPV6_GW2 "fd01::2" #define DMAC "11:11:11:11:11:11" #define DMAC_INIT { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, } #define DMAC2 "01:01:01:01:01:01" @@ -36,9 +47,11 @@ struct fib_lookup_test { const char *daddr; int expected_ret; const char *expected_src; + const char *expected_dst; int lookup_flags; __u32 tbid; __u8 dmac[6]; + __u32 mark; }; static const struct fib_lookup_test tests[] = { @@ -90,10 +103,47 @@ static const struct fib_lookup_test tests[] = { .daddr = IPV6_ADDR_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, .expected_src = IPV6_IFACE_ADDR_SEC, .lookup_flags = BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + /* policy routing */ + { .desc = "IPv4 policy routing, default", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv4 policy routing, mark doesn't point to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv4 policy routing, mark points to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv4 policy routing, mark points to a policy, but 
no flag", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, default", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv6 policy routing, mark doesn't point to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv6 policy routing, mark points to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, mark points to a policy, but no flag", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, }; -static int ifindex; - static int setup_netns(void) { int err; @@ -144,12 +194,24 @@ static int setup_netns(void) if (!ASSERT_OK(err, "write_sysctl(net.ipv6.conf.veth1.forwarding)")) goto fail; + /* Setup for policy routing tests */ + SYS(fail, "ip addr add %s/24 dev veth1", IPV4_LOCAL); + SYS(fail, "ip addr add %s/64 dev veth1 nodad", IPV6_LOCAL); + SYS(fail, "ip route add %s/32 via %s", IPV4_REMOTE_DST, IPV4_GW1); + SYS(fail, "ip route add %s/32 via %s table %s", IPV4_REMOTE_DST, IPV4_GW2, MARK_TABLE); + SYS(fail, "ip -6 route add %s/128 via %s", IPV6_REMOTE_DST, IPV6_GW1); + SYS(fail, "ip -6 route add %s/128 via %s table %s", IPV6_REMOTE_DST, IPV6_GW2, MARK_TABLE); + SYS(fail, "ip rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + SYS(fail, "ip -6 rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + return 0; fail: return -1; } -static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_lookup_test *test) +static int set_lookup_params(struct bpf_fib_lookup *params, + const struct fib_lookup_test *test, + int ifindex) { int ret; @@ -158,6 +220,7 @@ static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_loo params->l4_protocol = IPPROTO_TCP; params->ifindex = ifindex; params->tbid = test->tbid; + params->mark = test->mark; if (inet_pton(AF_INET6, test->daddr, params->ipv6_dst) == 1) { params->family = AF_INET6; @@ -190,40 +253,45 @@ static void mac_str(char *b, const __u8 *mac) mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); } -static void assert_src_ip(struct bpf_fib_lookup *fib_params, const char *expected_src) +static void assert_ip_address(int family, void *addr, const char *expected_str) { + char str[INET6_ADDRSTRLEN]; + u8 expected_addr[16]; + int addr_len = 0; int ret; - __u32 src6[4]; - __be32 src4; - switch (fib_params->family) { + switch (family) { case AF_INET6: - ret = inet_pton(AF_INET6, expected_src, src6); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ret = memcmp(src6, fib_params->ipv6_src, sizeof(fib_params->ipv6_src)); - if (!ASSERT_EQ(ret, 0, "fib_lookup ipv6 src")) { - char str_src6[64]; - - inet_ntop(AF_INET6, fib_params->ipv6_src, str_src6, - sizeof(str_src6)); - printf("ipv6 expected %s actual %s ", expected_src, - str_src6); - } - + ret = inet_pton(AF_INET6, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET6, expected_str)"); + addr_len = 16; break; case AF_INET: - ret = 
inet_pton(AF_INET, expected_src, &src4); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ASSERT_EQ(fib_params->ipv4_src, src4, "fib_lookup ipv4 src"); - + ret = inet_pton(AF_INET, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET, expected_str)"); + addr_len = 4; break; default: - PRINT_FAIL("invalid addr family: %d", fib_params->family); + PRINT_FAIL("invalid address family: %d", family); + break; + } + + if (memcmp(addr, expected_addr, addr_len)) { + inet_ntop(family, addr, str, sizeof(str)); + PRINT_FAIL("expected %s actual %s ", expected_str, str); } } +static void assert_src_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_src, expected); +} + +static void assert_dst_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_dst, expected); +} + void test_fib_lookup(void) { struct bpf_fib_lookup *fib_params; @@ -256,15 +324,18 @@ void test_fib_lookup(void) if (setup_netns()) goto fail; - ifindex = if_nametoindex("veth1"); - skb.ifindex = ifindex; + skb.ifindex = if_nametoindex("veth1"); + if (!ASSERT_NEQ(skb.ifindex, 0, "if_nametoindex(veth1)")) + goto fail; + fib_params = &skel->bss->fib_params; for (i = 0; i < ARRAY_SIZE(tests); i++) { printf("Testing %s ", tests[i].desc); - if (set_lookup_params(fib_params, &tests[i])) + if (set_lookup_params(fib_params, &tests[i], skb.ifindex)) continue; + skel->bss->fib_lookup_ret = -1; skel->bss->lookup_flags = tests[i].lookup_flags; @@ -278,6 +349,9 @@ void test_fib_lookup(void) if (tests[i].expected_src) assert_src_ip(fib_params, tests[i].expected_src); + if (tests[i].expected_dst) + assert_dst_ip(fib_params, tests[i].expected_dst); + ret = memcmp(tests[i].dmac, fib_params->dmac, sizeof(tests[i].dmac)); if (!ASSERT_EQ(ret, 0, "dmac not match")) { char expected[18], actual[18]; diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index c4773173a4e4..9e5f38739104 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -2,7 +2,6 @@ #include <test_progs.h> #include <network_helpers.h> #include <error.h> -#include <linux/if.h> #include <linux/if_tun.h> #include <sys/uio.h> diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index 8963f8a549f2..09f6487f58b9 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -5,6 +5,7 @@ #include "for_each_hash_map_elem.skel.h" #include "for_each_array_map_elem.skel.h" #include "for_each_map_elem_write_key.skel.h" +#include "for_each_multi_maps.skel.h" static unsigned int duration; @@ -143,6 +144,65 @@ static void test_write_map_key(void) for_each_map_elem_write_key__destroy(skel); } +static void test_multi_maps(void) +{ + struct for_each_multi_maps *skel; + __u64 val, array_total, hash_total; + __u32 key, max_entries; + int i, err; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + skel = for_each_multi_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_multi_maps__open_and_load")) + return; + + array_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + array_total += val; + err = bpf_map__update_elem(skel->maps.arraymap, &key, sizeof(key), + 
&val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "array_map_update")) + goto out; + } + + hash_total = 0; + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i + 100; + val = i + 1; + hash_total += val; + err = bpf_map__update_elem(skel->maps.hashmap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "hash_map_update")) + goto out; + } + + skel->bss->data_output = 0; + skel->bss->use_array = 1; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, array_total, "array output"); + + skel->bss->data_output = 0; + skel->bss->use_array = 0; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, hash_total, "hash output"); + +out: + for_each_multi_maps__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) @@ -151,4 +211,6 @@ void test_for_each(void) test_array_map(); if (test__start_subtest("write_map_key")) test_write_map_key(); + if (test__start_subtest("multi_maps")) + test_multi_maps(); } diff --git a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c index 8dd2af9081f4..284764e7179f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c +++ b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c @@ -88,6 +88,8 @@ static int attach(struct ip_check_defrag *skel, bool ipv6) int err = -1; nstoken = open_netns(NS1); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto out; skel->links.defrag = bpf_program__attach_netfilter(skel->progs.defrag, &opts); if (!ASSERT_OK_PTR(skel->links.defrag, "program attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 05000810e28e..51628455b6f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -336,15 +336,80 @@ static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) return strcmp((const char *) key1, (const char *) key2) == 0; } +static bool is_invalid_entry(char *buf, bool kernel) +{ + if (kernel && strchr(buf, '[')) + return true; + if (!kernel && !strchr(buf, '[')) + return true; + return false; +} + +static bool skip_entry(char *name) +{ + /* + * We attach to almost all kernel functions and some of them + * will cause 'suspicious RCU usage' when fprobe is attached + * to them. Filter out the current culprits - arch_cpu_idle + * default_idle and rcu_* functions. + */ + if (!strcmp(name, "arch_cpu_idle")) + return true; + if (!strcmp(name, "default_idle")) + return true; + if (!strncmp(name, "rcu_", 4)) + return true; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + return true; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + return true; + return false; +} + +/* Do comparision by ignoring '.llvm.<hash>' suffixes. */ +static int compare_name(const char *name1, const char *name2) +{ + const char *res1, *res2; + int len1, len2; + + res1 = strstr(name1, ".llvm."); + res2 = strstr(name2, ".llvm."); + len1 = res1 ? res1 - name1 : strlen(name1); + len2 = res2 ? 
res2 - name2 : strlen(name2); + + if (len1 == len2) + return strncmp(name1, name2, len1); + if (len1 < len2) + return strncmp(name1, name2, len1) <= 0 ? -1 : 1; + return strncmp(name1, name2, len2) >= 0 ? 1 : -1; +} + +static int load_kallsyms_compare(const void *p1, const void *p2) +{ + return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); +} + +static int search_kallsyms_compare(const void *p1, const struct ksym *p2) +{ + return compare_name(p1, p2->name); +} + static int get_syms(char ***symsp, size_t *cntp, bool kernel) { - size_t cap = 0, cnt = 0, i; - char *name = NULL, **syms = NULL; + size_t cap = 0, cnt = 0; + char *name = NULL, *ksym_name, **syms = NULL; struct hashmap *map; + struct ksyms *ksyms; + struct ksym *ks; char buf[256]; FILE *f; int err = 0; + ksyms = load_kallsyms_custom_local(load_kallsyms_compare); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local")) + return -EINVAL; + /* * The available_filter_functions contains many duplicates, * but other than that all symbols are usable in kprobe multi @@ -368,33 +433,23 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) } while (fgets(buf, sizeof(buf), f)) { - if (kernel && strchr(buf, '[')) - continue; - if (!kernel && !strchr(buf, '[')) + if (is_invalid_entry(buf, kernel)) continue; free(name); if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) continue; - /* - * We attach to almost all kernel functions and some of them - * will cause 'suspicious RCU usage' when fprobe is attached - * to them. Filter out the current culprits - arch_cpu_idle - * default_idle and rcu_* functions. - */ - if (!strcmp(name, "arch_cpu_idle")) - continue; - if (!strcmp(name, "default_idle")) - continue; - if (!strncmp(name, "rcu_", 4)) - continue; - if (!strcmp(name, "bpf_dispatcher_xdp_func")) - continue; - if (!strncmp(name, "__ftrace_invalid_address__", - sizeof("__ftrace_invalid_address__") - 1)) + if (skip_entry(name)) continue; - err = hashmap__add(map, name, 0); + ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); + if (!ks) { + err = -EINVAL; + goto error; + } + + ksym_name = ks->name; + err = hashmap__add(map, ksym_name, 0); if (err == -EEXIST) { err = 0; continue; @@ -407,8 +462,7 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) if (err) goto error; - syms[cnt++] = name; - name = NULL; + syms[cnt++] = ksym_name; } *symsp = syms; @@ -418,42 +472,88 @@ error: free(name); fclose(f); hashmap__free(map); - if (err) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (err) free(syms); + return err; +} + +static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) +{ + unsigned long *addr, *addrs, *tmp_addrs; + int err = 0, max_cnt, inc_cnt; + char *name = NULL; + size_t cnt = 0; + char buf[256]; + FILE *f; + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); + + if (!f) + return -ENOENT; + + /* In my local setup, the number of entries is 50k+ so Let us initially + * allocate space to hold 64k entries. If 64k is not enough, incrementally + * increase 1k each time. 
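A short illustration of the suffix-insensitive comparison defined above (the symbol names are made up): entries differing only by a '.llvm.<hash>' suffix compare as equal, while ordinary lexicographic ordering is preserved otherwise:

	compare_name("netif_rx", "netif_rx");			/* 0: identical            */
	compare_name("netif_rx.llvm.123456789", "netif_rx");	/* 0: suffix ignored       */
	compare_name("netif_rx", "netif_rx.llvm.987654321");	/* 0: suffix ignored       */
	compare_name("aaa", "bbb");				/* < 0: normal ordering    */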
+ */ + max_cnt = 65536; + inc_cnt = 1024; + addrs = malloc(max_cnt * sizeof(long)); + if (addrs == NULL) { + err = -ENOMEM; + goto error; } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) + continue; + if (skip_entry(name)) + continue; + + if (cnt == max_cnt) { + max_cnt += inc_cnt; + tmp_addrs = realloc(addrs, max_cnt); + if (!tmp_addrs) { + err = -ENOMEM; + goto error; + } + addrs = tmp_addrs; + } + + addrs[cnt++] = (unsigned long)addr; + } + + *addrsp = addrs; + *cntp = cnt; + +error: + free(name); + fclose(f); + if (err) + free(addrs); return err; } -static void test_kprobe_multi_bench_attach(bool kernel) +static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { - LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); - struct kprobe_multi_empty *skel = NULL; long attach_start_ns, attach_end_ns; long detach_start_ns, detach_end_ns; double attach_delta, detach_delta; struct bpf_link *link = NULL; - char **syms = NULL; - size_t cnt = 0, i; - - if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) - return; - - skel = kprobe_multi_empty__open_and_load(); - if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) - goto cleanup; - - opts.syms = (const char **) syms; - opts.cnt = cnt; attach_start_ns = get_time_ns(); link = bpf_program__attach_kprobe_multi_opts(skel->progs.test_kprobe_empty, - NULL, &opts); + NULL, opts); attach_end_ns = get_time_ns(); if (!ASSERT_OK_PTR(link, "bpf_program__attach_kprobe_multi_opts")) - goto cleanup; + return; detach_start_ns = get_time_ns(); bpf_link__destroy(link); @@ -462,17 +562,65 @@ static void test_kprobe_multi_bench_attach(bool kernel) attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0; detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0; - printf("%s: found %lu functions\n", __func__, cnt); + printf("%s: found %lu functions\n", __func__, opts->cnt); printf("%s: attached in %7.3lfs\n", __func__, attach_delta); printf("%s: detached in %7.3lfs\n", __func__, detach_delta); +} + +static void test_kprobe_multi_bench_attach(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + char **syms = NULL; + size_t cnt = 0; + + if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.syms = (const char **) syms; + opts.cnt = cnt; + + do_bench_test(skel, &opts); cleanup: kprobe_multi_empty__destroy(skel); - if (syms) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (syms) free(syms); +} + +static void test_kprobe_multi_bench_attach_addr(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + unsigned long *addrs = NULL; + size_t cnt = 0; + int err; + + err = get_addrs(&addrs, &cnt, kernel); + if (err == -ENOENT) { + test__skip(); + return; } + + if (!ASSERT_OK(err, "get_addrs")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.addrs = addrs; + opts.cnt = cnt; + + do_bench_test(skel, &opts); + +cleanup: + kprobe_multi_empty__destroy(skel); + free(addrs); } static void test_attach_override(void) @@ -515,6 +663,10 @@ void serial_test_kprobe_multi_bench_attach(void) test_kprobe_multi_bench_attach(true); if (test__start_subtest("modules")) 
test_kprobe_multi_bench_attach(false); + if (test__start_subtest("kernel")) + test_kprobe_multi_bench_attach_addr(true); + if (test__start_subtest("modules")) + test_kprobe_multi_bench_attach_addr(false); } void test_kprobe_multi_test(void) diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index b295969b263b..dc7aab532fb1 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -5,8 +5,6 @@ #include "test_ksyms.skel.h" #include <sys/stat.h> -static int duration; - void test_ksyms(void) { const char *btf_path = "/sys/kernel/btf/vmlinux"; @@ -18,43 +16,37 @@ void test_ksyms(void) int err; err = kallsyms_find("bpf_link_fops", &link_fops_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "bpf_link_fops: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "bpf_link_fops: ksym_find")) return; err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "__per_cpu_start: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "__per_cpu_start: ksym_find")) return; - if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) + if (!ASSERT_OK(stat(btf_path, &st), "stat_btf")) return; btf_size = st.st_size; skel = test_ksyms__open_and_load(); - if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) + if (!ASSERT_OK_PTR(skel, "test_ksyms__open_and_load")) return; err = test_ksyms__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ksyms__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); data = skel->data; - CHECK(data->out__bpf_link_fops != link_fops_addr, "bpf_link_fops", - "got 0x%llx, exp 0x%llx\n", - data->out__bpf_link_fops, link_fops_addr); - CHECK(data->out__bpf_link_fops1 != 0, "bpf_link_fops1", - "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0); - CHECK(data->out__btf_size != btf_size, "btf_size", - "got %llu, exp %llu\n", data->out__btf_size, btf_size); - CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start", - "got %llu, exp %llu\n", data->out__per_cpu_start, - per_cpu_start_addr); + ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops"); + ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1"); + ASSERT_EQ(data->out__btf_size, btf_size, "btf_size"); + ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start"); cleanup: test_ksyms__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 8f8d792307c1..4e0f69295872 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -273,6 +273,8 @@ static int run_mptcpify(int cgroup_fd) if (!ASSERT_OK_PTR(mptcpify_skel, "skel_open_load")) return libbpf_get_error(mptcpify_skel); + mptcpify_skel->bss->pid = getpid(); + err = mptcpify__attach(mptcpify_skel); if (!ASSERT_OK(err, "skel_attach")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/preempt_lock.c b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c new file mode 100644 index 000000000000..02917c672441 --- /dev/null +++ 
b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> +#include <preempt_lock.skel.h> + +void test_preempt_lock(void) +{ + RUN_TESTS(preempt_lock); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 48c5695b7abf..4c6f42dae409 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -13,6 +13,7 @@ #include <linux/perf_event.h> #include <linux/ring_buffer.h> #include "test_ringbuf.lskel.h" +#include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" #define EDONE 7777 @@ -326,6 +327,68 @@ cleanup: test_ringbuf_lskel__destroy(skel); } +/* + * Test ring_buffer__consume_n() by producing N_TOT_SAMPLES samples in the ring + * buffer, via getpid(), and consuming them in chunks of N_SAMPLES. + */ +#define N_TOT_SAMPLES 32 +#define N_SAMPLES 4 + +/* Sample value to verify the callback validity */ +#define SAMPLE_VALUE 42L + +static int process_n_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + ASSERT_EQ(s->value, SAMPLE_VALUE, "sample_value"); + + return 0; +} + +static void ringbuf_n_subtest(void) +{ + struct test_ringbuf_n_lskel *skel_n; + int err, i; + + skel_n = test_ringbuf_n_lskel__open(); + if (!ASSERT_OK_PTR(skel_n, "test_ringbuf_n_lskel__open")) + return; + + skel_n->maps.ringbuf.max_entries = getpagesize(); + skel_n->bss->pid = getpid(); + + err = test_ringbuf_n_lskel__load(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__load")) + goto cleanup; + + ringbuf = ring_buffer__new(skel_n->maps.ringbuf.map_fd, + process_n_sample, NULL, NULL); + if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new")) + goto cleanup; + + err = test_ringbuf_n_lskel__attach(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__attach")) + goto cleanup_ringbuf; + + /* Produce N_TOT_SAMPLES samples in the ring buffer by calling getpid() */ + skel_n->bss->value = SAMPLE_VALUE; + for (i = 0; i < N_TOT_SAMPLES; i++) + syscall(__NR_getpgid); + + /* Consume all samples from the ring buffer in batches of N_SAMPLES */ + for (i = 0; i < N_TOT_SAMPLES; i += err) { + err = ring_buffer__consume_n(ringbuf, N_SAMPLES); + if (!ASSERT_EQ(err, N_SAMPLES, "rb_consume")) + goto cleanup_ringbuf; + } + +cleanup_ringbuf: + ring_buffer__free(ringbuf); +cleanup: + test_ringbuf_n_lskel__destroy(skel_n); +} + static int process_map_key_sample(void *ctx, void *data, size_t len) { struct sample *s; @@ -384,6 +447,8 @@ void test_ringbuf(void) { if (test__start_subtest("ringbuf")) ringbuf_subtest(); + if (test__start_subtest("ringbuf_n")) + ringbuf_n_subtest(); if (test__start_subtest("ringbuf_map_key")) ringbuf_map_key_subtest(); } diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index b15b343ebb6b..920aee41bd58 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -179,7 +179,7 @@ static void test_send_signal_nmi(bool signal_thread) pmu_fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, -1 /* cpu */, -1 /* group_fd */, 0 /* flags */); if (pmu_fd == -1) { - if (errno == ENOENT) { + if (errno == ENOENT || errno == EOPNOTSUPP) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 
1374b626a985..0b9bd1d6f7cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -15,6 +15,7 @@ #include <unistd.h> #include "test_progs.h" +#include "network_helpers.h" #define BIND_PORT 1234 #define CONNECT_PORT 4321 @@ -22,8 +23,6 @@ #define NS_SELF "/proc/self/ns/net" #define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" -static const struct timeval timeo_sec = { .tv_sec = 3 }; -static const size_t timeo_optlen = sizeof(timeo_sec); static int stop, duration; static bool @@ -73,52 +72,6 @@ configure_stack(void) return true; } -static int -start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto close_out; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int -connect_to_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = -1; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(connect(fd, addr, len))) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static in_port_t get_port(int fd) { @@ -161,7 +114,7 @@ run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) in_port_t port; int ret = 1; - client = connect_to_server(addr, len, type); + client = connect_to_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (client == -1) { perror("Cannot connect to server"); goto out; @@ -310,7 +263,9 @@ void test_sk_assign(void) continue; prepare_addr(test->addr, test->family, BIND_PORT, false); addr = (const struct sockaddr *)test->addr; - server = start_server(addr, test->len, test->type); + server = start_server_addr(test->type, + (const struct sockaddr_storage *)addr, + test->len, NULL); if (server == -1) goto close; diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 5fd617718991..61668e0f11b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -328,7 +328,7 @@ static void test_bind(struct sock_addr_test *test) goto cleanup; /* Try to connect to server just in case */ - client = connect_to_addr(&expected_addr, expected_addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &expected_addr, expected_addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -357,7 +357,7 @@ static void test_connect(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -538,7 +538,7 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; diff --git 
a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 77e26ecffa9d..1337153eb0ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -131,6 +131,65 @@ out: test_skmsg_load_helpers__destroy(skel); } +static void test_skmsg_helpers_with_link(enum bpf_map_type map_type) +{ + struct bpf_program *prog, *prog_clone, *prog_clone2; + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts); + struct test_skmsg_load_helpers *skel; + struct bpf_link *link, *link2; + int err, map; + + skel = test_skmsg_load_helpers__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_skmsg_load_helpers__open_and_load")) + return; + + prog = skel->progs.prog_msg_verdict; + prog_clone = skel->progs.prog_msg_verdict_clone; + prog_clone2 = skel->progs.prog_msg_verdict_clone2; + map = bpf_map__fd(skel->maps.sock_map); + + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + /* Fail since bpf_link for the same prog has been created. */ + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_ERR(err, "bpf_prog_attach")) + goto out; + + /* Fail since bpf_link for the same prog type has been created. */ + link2 = bpf_program__attach_sockmap(prog_clone, map); + if (!ASSERT_ERR_PTR(link2, "bpf_program__attach_sockmap")) { + bpf_link__detach(link2); + goto out; + } + + err = bpf_link__update_program(link, prog_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different type attempts to do update. */ + err = bpf_link__update_program(link, skel->progs.prog_skb_verdict); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + /* Fail since the old prog does not match the one in the kernel. */ + opts.old_prog_fd = bpf_program__fd(prog_clone2); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_ERR(err, "bpf_link_update")) + goto out; + + opts.old_prog_fd = bpf_program__fd(prog_clone); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "bpf_link_update")) + goto out; +out: + bpf_link__detach(link); + test_skmsg_load_helpers__destroy(skel); +} + static void test_sockmap_update(enum bpf_map_type map_type) { int err, prog, src; @@ -298,6 +357,40 @@ out: test_sockmap_skb_verdict_attach__destroy(skel); } +static void test_sockmap_skb_verdict_attach_with_link(void) +{ + struct test_sockmap_skb_verdict_attach *skel; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + skel = test_sockmap_skb_verdict_attach__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + prog = skel->progs.prog_skb_verdict; + map = bpf_map__fd(skel->maps.sock_map); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + bpf_link__detach(link); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + /* Fail since attaching with the same prog/map has been done. 
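+	 * The same program is still attached to this map via bpf_prog_attach()
+	 * above, so creating a bpf_link for it here is expected to be rejected.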
*/ + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_sockmap")) + bpf_link__detach(link); + + err = bpf_prog_detach2(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT); + if (!ASSERT_OK(err, "bpf_prog_detach2")) + goto out; +out: + test_sockmap_skb_verdict_attach__destroy(skel); +} + static __u32 query_prog_id(int prog_fd) { struct bpf_prog_info info = {}; @@ -475,30 +568,19 @@ out: test_sockmap_drop_prog__destroy(drop); } -static void test_sockmap_skb_verdict_peek(void) +static void test_sockmap_skb_verdict_peek_helper(int map) { - int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail; - struct test_sockmap_pass_prog *pass; + int err, s, c1, p1, zero = 0, sent, recvd, avail; char snd[256] = "0123456789"; char rcv[256] = "0"; - pass = test_sockmap_pass_prog__open_and_load(); - if (!ASSERT_OK_PTR(pass, "open_and_load")) - return; - verdict = bpf_program__fd(pass->progs.prog_skb_verdict); - map = bpf_map__fd(pass->maps.sock_map_rx); - - err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach")) - goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - goto out; + return; err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); if (!ASSERT_OK(err, "create_pairs(s)")) - goto out; + return; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) @@ -520,7 +602,58 @@ static void test_sockmap_skb_verdict_peek(void) out_close: close(c1); close(p1); +} + +static void test_sockmap_skb_verdict_peek(void) +{ + struct test_sockmap_pass_prog *pass; + int err, map, verdict; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); + map = bpf_map__fd(pass->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + +out: + test_sockmap_pass_prog__destroy(pass); +} + +static void test_sockmap_skb_verdict_peek_with_link(void) +{ + struct test_sockmap_pass_prog *pass; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + prog = pass->progs.prog_skb_verdict; + map = bpf_map__fd(pass->maps.sock_map_rx); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + err = bpf_link__update_program(link, pass->progs.prog_skb_verdict_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different attach type attempts to do update. 
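+	 * The link holds an sk_skb/stream_verdict program, while prog_skb_parser
+	 * is an sk_skb/stream_parser program, so the update is expected to fail.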
*/ + err = bpf_link__update_program(link, pass->progs.prog_skb_parser); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + ASSERT_EQ(pass->bss->clone_called, 1, "clone_called"); out: + bpf_link__detach(link); test_sockmap_pass_prog__destroy(pass); } @@ -788,6 +921,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, BPF_SK_SKB_VERDICT); } + if (test__start_subtest("sockmap skb_verdict attach_with_link")) + test_sockmap_skb_verdict_attach_with_link(); if (test__start_subtest("sockmap msg_verdict progs query")) test_sockmap_progs_query(BPF_SK_MSG_VERDICT); if (test__start_subtest("sockmap stream_parser progs query")) @@ -804,6 +939,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); + if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) + test_sockmap_skb_verdict_peek_with_link(); if (test__start_subtest("sockmap unconnected af_unix")) test_sockmap_unconnected_unix(); if (test__start_subtest("sockmap one socket to many map entries")) @@ -812,4 +949,8 @@ void test_sockmap_basic(void) test_sockmap_many_maps(); if (test__start_subtest("sockmap same socket replace")) test_sockmap_same_sock(); + if (test__start_subtest("sockmap sk_msg attach sockmap helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg attach sockhash helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index a92807bfcd13..e91b59366030 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -767,6 +767,24 @@ static void test_msg_redir_to_connected(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_connected_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int prog_msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int link_fd; + + link_fd = bpf_link_create(prog_msg_verdict, sock_map, BPF_SK_MSG_VERDICT, NULL); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + return; + + redir_to_connected(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + close(link_fd); +} + static void redir_to_listening(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { @@ -869,6 +887,24 @@ static void test_msg_redir_to_listening(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + struct bpf_program *verdict = skel->progs.prog_msg_verdict; + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + struct bpf_link *link; + + link = bpf_program__attach_sockmap(verdict, sock_map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + return; + + redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + bpf_link__detach(link); +} + static void redir_partial(int family, int sotype, int sock_map, int parser_map) { int s, c0 = 
-1, c1 = -1, p0 = -1, p1 = -1; @@ -1316,7 +1352,9 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, TEST(test_skb_redir_to_listening), TEST(test_skb_redir_partial), TEST(test_msg_redir_to_connected), + TEST(test_msg_redir_to_connected_with_link), TEST(test_msg_redir_to_listening), + TEST(test_msg_redir_to_listening_with_link), }; const char *family_name, *map_name; const struct redir_test *t; diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 5db9eec24b5b..0832fd787457 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -35,7 +35,7 @@ retry: pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu 0 */, -1 /* group id */, 0 /* flags */); - if (pmu_fd < 0 && errno == ENOENT) { + if (pmu_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index dbe06aeaa2b2..b1073d36d77a 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -530,7 +530,7 @@ static int wait_netstamp_needed_key(void) __u64 tstamp = 0; nstoken = open_netns(NS_DST); - if (!nstoken) + if (!ASSERT_OK_PTR(nstoken, "setns dst")) return -1; srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0); diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index 8fe84da1b9b4..f2b99d95d916 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; static void send_byte(int fd) @@ -83,6 +86,17 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, err++; } + /* Precise values of mrtt and srtt are unavailable, just make sure they are nonzero */ + if (val.mrtt_us == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[0] (mrtt_us) %u == 0", msg, val.mrtt_us); + err++; + } + + if (val.srtt == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[1] (srtt) %u == 0", msg, val.srtt); + err++; + } + return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index 098776d00ab4..7cf2b9ddd3e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -138,11 +138,35 @@ static void test_struct_ops_not_zeroed(void) struct_ops_module__destroy(skel); } +/* The signature of an implementation might not match the signature of the + * function pointer prototype defined in the BPF program. This mismatch + * should be allowed as long as the behavior of the operator program + * adheres to the signature in the kernel. Libbpf should not enforce the + * signature; rather, let the kernel verifier handle the enforcement. 
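+ * As a concrete example, the bpf_testmod_ops___incompatible variant declared
+ * in progs/struct_ops_module.c later in this diff re-declares test_1/test_2
+ * with different prototypes, yet attaching it is still expected to succeed.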
+ */ +static void test_struct_ops_incompatible(void) +{ + struct struct_ops_module *skel; + struct bpf_link *link; + + skel = struct_ops_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + link = bpf_map__attach_struct_ops(skel->maps.testmod_incompatible); + if (ASSERT_OK_PTR(link, "attach_struct_ops")) + bpf_link__destroy(link); + + struct_ops_module__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) test_struct_ops_load(); if (test__start_subtest("test_struct_ops_not_zeroed")) test_struct_ops_not_zeroed(); + if (test__start_subtest("test_struct_ops_incompatible")) + test_struct_ops_incompatible(); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 5f1fb0a2ea56..cec746e77cd3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -612,6 +612,8 @@ static void test_ipip_tunnel(enum ipip_encap encap) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); if (!ASSERT_OK(err, "test_ping")) goto done; @@ -666,6 +668,8 @@ static void test_xfrm_tunnel(void) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); close_netns(nstoken); if (!ASSERT_OK(err, "test_ping")) diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c index 7b9124d506a5..e56e88596d64 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_printk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c @@ -5,18 +5,19 @@ #include "trace_printk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "testing,testing" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_printk(void) { struct trace_printk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_printk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_printk_lskel__open(); if (!ASSERT_OK_PTR(skel, "trace_printk__open")) @@ -35,16 +36,6 @@ void serial_test_trace_printk(void) if (!ASSERT_OK(err, "trace_printk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... 
*/ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_printk_lskel__detach(skel); @@ -56,21 +47,12 @@ void serial_test_trace_printk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_printk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_printk_ran, "found")) goto cleanup; cleanup: trace_printk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c index 44ea2fd88f4c..2af6a6f2096a 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c @@ -5,18 +5,19 @@ #include "trace_vprintk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "1,2,3,4,5,6,7,8,9,10" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_vprintk(void) { struct trace_vprintk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_vprintk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_vprintk_lskel__open_and_load(); if (!ASSERT_OK_PTR(skel, "trace_vprintk__open_and_load")) @@ -28,16 +29,6 @@ void serial_test_trace_vprintk(void) if (!ASSERT_OK(err, "trace_vprintk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_vprintk_lskel__detach(skel); @@ -49,14 +40,8 @@ void serial_test_trace_vprintk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_vprintk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_vprintk_ran, "found")) goto cleanup; @@ -66,7 +51,4 @@ void serial_test_trace_vprintk(void) cleanup: trace_vprintk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c new file mode 100644 index 000000000000..3918ecc2ee91 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include <test_progs.h> + +#include "verifier_kfunc_prog_types.skel.h" + +void test_verifier_kfunc_prog_types(void) +{ + RUN_TESTS(verifier_kfunc_prog_types); +} diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c new file mode 100644 index 000000000000..c4bacd3160e1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires */ +#include <test_progs.h> +#include "wq.skel.h" +#include "wq_failures.skel.h" + +void serial_test_wq(void) +{ + struct wq *wq_skel = NULL; + int err, prog_fd; + + LIBBPF_OPTS(bpf_test_run_opts, topts); + + RUN_TESTS(wq); + + /* re-run the success test to check if the timer was actually executed */ + + wq_skel = wq__open_and_load(); + if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + return; + + err = wq__attach(wq_skel); + if (!ASSERT_OK(err, "wq_attach")) + return; + + prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + usleep(50); /* 10 usecs should be enough, but give it extra */ + + ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); + wq__destroy(wq_skel); +} + +void serial_test_failures_wq(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + + RUN_TESTS(wq_failures); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 05edcf32f528..f76b5d67a3ee 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -384,6 +384,8 @@ void test_xdp_metadata(void) SYS(out, "ip netns add " RX_NETNS_NAME); tok = open_netns(TX_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add numtxqueues 1 numrxqueues 1 " TX_NAME " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1"); SYS(out, "ip link set " RX_NAME " netns " RX_NETNS_NAME); @@ -400,6 +402,8 @@ void test_xdp_metadata(void) SYS(out, "ip -4 neigh add " RX_ADDR " lladdr " RX_MAC " dev " TX_NAME_VLAN); switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; SYS(out, "ip link set dev " RX_NAME " address " RX_MAC); SYS(out, "ip link set dev " RX_NAME " up"); @@ -449,6 +453,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Setup separate AF_XDP for TX interface nad send packet to the RX socket. */ tx_ifindex = if_nametoindex(TX_NAME); @@ -461,6 +467,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; /* Verify packet sent from AF_XDP has proper metadata. */ if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, true), 0, @@ -468,6 +476,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; complete_tx(&tx_xsk); /* Now check metadata of packet, generated with network stack */ @@ -475,6 +485,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, false), 0, "verify_xsk_metadata")) @@ -498,6 +510,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Send packet to trigger . 
*/ if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0, @@ -505,6 +519,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; while (!retries--) { if (bpf_obj2->bss->called) diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c new file mode 100644 index 000000000000..55f10563208d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <stdbool.h> +#include "bpf_arena_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 10); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +__u32 pid = 0; + +#undef __arena +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) +#define __arena __attribute__((address_space(1))) +#else +#define __arena SEC(".addr_space.1") +#endif + +__u64 __arena add64_value = 1; +__u64 __arena add64_result = 0; +__u32 __arena add32_value = 1; +__u32 __arena add32_result = 0; +__u64 __arena add_stack_value_copy = 0; +__u64 __arena add_stack_result = 0; +__u64 __arena add_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int add(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 add_stack_value = 1; + + add64_result = __sync_fetch_and_add(&add64_value, 2); + add32_result = __sync_fetch_and_add(&add32_value, 2); + add_stack_result = __sync_fetch_and_add(&add_stack_value, 2); + add_stack_value_copy = add_stack_value; + __sync_fetch_and_add(&add_noreturn_value, 2); +#endif + + return 0; +} + +__s64 __arena sub64_value = 1; +__s64 __arena sub64_result = 0; +__s32 __arena sub32_value = 1; +__s32 __arena sub32_result = 0; +__s64 __arena sub_stack_value_copy = 0; +__s64 __arena sub_stack_result = 0; +__s64 __arena sub_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int sub(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 sub_stack_value = 1; + + sub64_result = __sync_fetch_and_sub(&sub64_value, 2); + sub32_result = __sync_fetch_and_sub(&sub32_value, 2); + sub_stack_result = __sync_fetch_and_sub(&sub_stack_value, 2); + sub_stack_value_copy = sub_stack_value; + __sync_fetch_and_sub(&sub_noreturn_value, 2); +#endif + + return 0; +} + +__u64 __arena and64_value = (0x110ull << 32); +__u32 __arena and32_value = 0x110; + +SEC("raw_tp/sys_enter") +int and(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + + __sync_fetch_and_and(&and64_value, 0x011ull << 32); + __sync_fetch_and_and(&and32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena or32_value = 0x110; +__u64 __arena or64_value = (0x110ull << 32); + +SEC("raw_tp/sys_enter") +int or(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_or(&or64_value, 0x011ull << 32); + __sync_fetch_and_or(&or32_value, 0x011); 
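+	/* With the initial values above, the expected results are
+	 * or64_value == (0x111ull << 32) and or32_value == 0x111.
+	 */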
+#endif + + return 0; +} + +__u64 __arena xor64_value = (0x110ull << 32); +__u32 __arena xor32_value = 0x110; + +SEC("raw_tp/sys_enter") +int xor(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_xor(&xor64_value, 0x011ull << 32); + __sync_fetch_and_xor(&xor32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena cmpxchg32_value = 1; +__u32 __arena cmpxchg32_result_fail = 0; +__u32 __arena cmpxchg32_result_succeed = 0; +__u64 __arena cmpxchg64_value = 1; +__u64 __arena cmpxchg64_result_fail = 0; +__u64 __arena cmpxchg64_result_succeed = 0; + +SEC("raw_tp/sys_enter") +int cmpxchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + cmpxchg64_result_fail = __sync_val_compare_and_swap(&cmpxchg64_value, 0, 3); + cmpxchg64_result_succeed = __sync_val_compare_and_swap(&cmpxchg64_value, 1, 2); + + cmpxchg32_result_fail = __sync_val_compare_and_swap(&cmpxchg32_value, 0, 3); + cmpxchg32_result_succeed = __sync_val_compare_and_swap(&cmpxchg32_value, 1, 2); +#endif + + return 0; +} + +__u64 __arena xchg64_value = 1; +__u64 __arena xchg64_result = 0; +__u32 __arena xchg32_value = 1; +__u32 __arena xchg32_result = 0; + +SEC("raw_tp/sys_enter") +int xchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 val64 = 2; + __u32 val32 = 2; + + xchg64_result = __sync_lock_test_and_set(&xchg64_value, val64); + xchg32_result = __sync_lock_test_and_set(&xchg32_value, val32); +#endif + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h index 22914a70db54..73ba32e9a693 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h @@ -13,7 +13,7 @@ struct __cgrps_kfunc_map_value { struct cgroup __kptr * cgrp; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __cgrps_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/crypto_basic.c b/tools/testing/selftests/bpf/progs/crypto_basic.c new file mode 100644 index 000000000000..8cf7168b42d5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_basic.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +int status; +SEC("syscall") +int crypto_release(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +SEC("syscall") +__failure __msg("Unreleased reference") +int crypto_acquire(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + cctx = bpf_crypto_ctx_acquire(cctx); + if (!cctx) + return -EINVAL; + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c new file mode 100644 index 000000000000..e61fe0882293 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_bench.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +const volatile unsigned int len = 16; +char cipher[128] = {}; +u32 key_len, authsize; +char dst[256] = {}; +u8 key[256] = {}; +long hits = 0; +int status; + +SEC("syscall") +int crypto_setup(void *args) +{ + struct bpf_crypto_ctx *cctx; + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + int err = 0; + + status = 0; + + if (!cipher[0] || !key_len || key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, cipher, sizeof(cipher)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int crypto_encrypt(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return 0; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return 0; + } + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +SEC("tc") +int crypto_decrypt(struct __sk_buff *skb) +{ + struct bpf_dynptr psrc, pdst, iv; + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + + v = crypto_ctx_value_lookup(); + if (!v) + return -ENOENT; + + ctx = v->ctx; + if (!ctx) + return -ENOENT; + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +char __license[] SEC("license") = "GPL"; diff --git 
a/tools/testing/selftests/bpf/progs/crypto_common.h b/tools/testing/selftests/bpf/progs/crypto_common.h new file mode 100644 index 000000000000..57dd7a68a8c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_common.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#ifndef _CRYPTO_COMMON_H +#define _CRYPTO_COMMON_H + +#include "errno.h" +#include <stdbool.h> + +struct bpf_crypto_ctx *bpf_crypto_ctx_create(const struct bpf_crypto_params *params, + u32 params__sz, int *err) __ksym; +struct bpf_crypto_ctx *bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) __ksym; +void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) __ksym; +int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; +int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; + +struct __crypto_ctx_value { + struct bpf_crypto_ctx __kptr * ctx; +}; + +struct array_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct __crypto_ctx_value); + __uint(max_entries, 1); +} __crypto_ctx_map SEC(".maps"); + +static inline struct __crypto_ctx_value *crypto_ctx_value_lookup(void) +{ + u32 key = 0; + + return bpf_map_lookup_elem(&__crypto_ctx_map, &key); +} + +static inline int crypto_ctx_insert(struct bpf_crypto_ctx *ctx) +{ + struct __crypto_ctx_value local, *v; + struct bpf_crypto_ctx *old; + u32 key = 0; + int err; + + local.ctx = NULL; + err = bpf_map_update_elem(&__crypto_ctx_map, &key, &local, 0); + if (err) { + bpf_crypto_ctx_release(ctx); + return err; + } + + v = bpf_map_lookup_elem(&__crypto_ctx_map, &key); + if (!v) { + bpf_crypto_ctx_release(ctx); + return -ENOENT; + } + + old = bpf_kptr_xchg(&v->ctx, ctx); + if (old) { + bpf_crypto_ctx_release(old); + return -EEXIST; + } + + return 0; +} + +#endif /* _CRYPTO_COMMON_H */ diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c new file mode 100644 index 000000000000..1be0a3fa5efd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +unsigned char key[256] = {}; +u16 udp_test_port = 7777; +u32 authsize, key_len; +char algo[128] = {}; +char dst[16] = {}; +int status; + +static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) +{ + struct ipv6hdr ip6h; + struct udphdr udph; + u32 offset; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6)) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h))) + return -1; + + if (ip6h.nexthdr != IPPROTO_UDP) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(ip6h), &udph, sizeof(udph))) + return -1; + + if (udph.dest != __bpf_htons(udp_test_port)) + return -1; + + offset = ETH_HLEN + sizeof(ip6h) + sizeof(udph); + if (skb->len < offset + 16) + return -1; + + /* let's make sure that 16 bytes of payload are in the linear part of skb */ + bpf_skb_pull_data(skb, offset + 16); + bpf_dynptr_from_skb(skb, 0, psrc); + bpf_dynptr_adjust(psrc, offset, offset + 16); + + return 0; +} + +SEC("syscall") +int skb_crypto_setup(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + if (key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, algo, sizeof(algo)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int decrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. + */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +SEC("tc") +int encrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + status = 0; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. 
+ */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c index 1efa746c25dc..ec0c595d47af 100644 --- a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c +++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c @@ -11,8 +11,17 @@ int BPF_PROG(test_1, struct bpf_dummy_ops_state *state) { int ret; - if (!state) - return 0xf2f3f4f5; + /* Check that 'state' nullable status is detected correctly. + * If 'state' argument would be assumed non-null by verifier + * the code below would be deleted as dead (which it shouldn't). + * Hide it from the compiler behind 'asm' block to avoid + * unnecessary optimizations. + */ + asm volatile ( + "if %[state] != 0 goto +2;" + "r0 = 0xf2f3f4f5;" + "exit;" + ::[state]"p"(state)); ret = state->val; state->val = 0x5a; @@ -25,7 +34,7 @@ SEC("struct_ops/test_2") int BPF_PROG(test_2, struct bpf_dummy_ops_state *state, int a1, unsigned short a2, char a3, unsigned long a4) { - test_2_args[0] = (unsigned long)state; + test_2_args[0] = state->val; test_2_args[1] = a1; test_2_args[2] = a2; test_2_args[3] = a3; diff --git a/tools/testing/selftests/bpf/progs/for_each_multi_maps.c b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c new file mode 100644 index 000000000000..ff0bed7d4459 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 5); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct callback_ctx { + int output; +}; + +u32 data_output = 0; +int use_array = 0; + +static __u64 +check_map_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + return 0; +} + +SEC("tc") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + if (use_array) + bpf_for_each_map_elem(&arraymap, check_map_elem, &data, 0); + else + bpf_for_each_map_elem(&hashmap, check_map_elem, &data, 0); + data_output = data.output; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c index 53301ae8a8f7..cbdc730c3a47 100644 --- a/tools/testing/selftests/bpf/progs/mptcpify.c +++ b/tools/testing/selftests/bpf/progs/mptcpify.c @@ -6,10 +6,14 @@ #include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; +int pid; SEC("fmod_ret/update_socket_protocol") int BPF_PROG(mptcpify, int family, int type, int protocol) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return protocol; + if ((family == AF_INET || family == AF_INET6) && type == SOCK_STREAM && (!protocol || protocol == IPPROTO_TCP)) { diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c new file mode 100644 index 000000000000..672fc368d9c4 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_experimental.h" + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("3 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_3(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_3_minus_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_enable(); + bpf_preempt_enable(); + return 0; +} + +static __noinline void preempt_disable(void) +{ + bpf_preempt_disable(); +} + +static __noinline void preempt_enable(void) +{ + bpf_preempt_enable(); +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_2_minus_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + preempt_enable(); + return 0; +} + +static __noinline void preempt_balance_subprog(void) +{ + preempt_disable(); + preempt_enable(); +} + +SEC("?tc") +__success int preempt_balance(struct __sk_buff *ctx) +{ + bpf_guard_preempt(); + return 0; +} + +SEC("?tc") +__success int preempt_balance_subprog_test(struct __sk_buff *ctx) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#") +int preempt_sleepable_helper(void *ctx) +{ + u32 data; + + bpf_preempt_disable(); + bpf_copy_from_user(&data, sizeof(data), NULL); + bpf_preempt_enable(); + return 0; +} + +int __noinline preempt_global_subprog(void) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?tc") +__failure __msg("global function calls are not allowed with preemption disabled") +int preempt_global_subprog_test(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_global_subprog(); + preempt_enable(); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 86e1e50c5531..63b065dae002 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -68,3 +68,16 @@ struct bpf_testmod_ops___zeroed testmod_zeroed = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, }; + +struct bpf_testmod_ops___incompatible { + int (*test_1)(void); + void (*test_2)(int *a); + int data; +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops___incompatible testmod_incompatible = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2, + .data = 3, +}; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h 
b/tools/testing/selftests/bpf/progs/task_kfunc_common.h index 41f2d44f49cb..6720c4b5be41 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h @@ -13,7 +13,7 @@ struct __tasks_kfunc_map_value { struct task_struct __kptr * task; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __tasks_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c new file mode 100644 index 000000000000..fcfbfe0336b4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_tracing.h> + +extern void bbr_init(struct sock *sk) __ksym; +extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym; +extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; +extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; +extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern u32 bbr_ssthresh(struct sock *sk) __ksym; +extern u32 bbr_min_tso_segs(struct sock *sk) __ksym; +extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym; + +extern void dctcp_init(struct sock *sk) __ksym; +extern void dctcp_update_alpha(struct sock *sk, u32 flags) __ksym; +extern void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) __ksym; +extern u32 dctcp_ssthresh(struct sock *sk) __ksym; +extern u32 dctcp_cwnd_undo(struct sock *sk) __ksym; +extern void dctcp_state(struct sock *sk, u8 new_state) __ksym; + +extern void cubictcp_init(struct sock *sk) __ksym; +extern u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) __ksym; +extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; + +SEC("struct_ops/init") +void BPF_PROG(init, struct sock *sk) +{ + bbr_init(sk); + dctcp_init(sk); + cubictcp_init(sk); +} + +SEC("struct_ops/in_ack_event") +void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) +{ + dctcp_update_alpha(sk, flags); +} + +SEC("struct_ops/cong_control") +void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs) +{ + bbr_main(sk, rs); +} + +SEC("struct_ops/cong_avoid") +void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked) +{ + cubictcp_cong_avoid(sk, ack, acked); +} + +SEC("struct_ops/sndbuf_expand") +u32 BPF_PROG(sndbuf_expand, struct sock *sk) +{ + return bbr_sndbuf_expand(sk); +} + +SEC("struct_ops/undo_cwnd") +u32 BPF_PROG(undo_cwnd, struct sock *sk) +{ + bbr_undo_cwnd(sk); + return dctcp_cwnd_undo(sk); +} + +SEC("struct_ops/cwnd_event") +void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + bbr_cwnd_event(sk, event); + dctcp_cwnd_event(sk, event); + cubictcp_cwnd_event(sk, event); +} + +SEC("struct_ops/ssthresh") +u32 BPF_PROG(ssthresh, struct sock *sk) +{ + bbr_ssthresh(sk); + dctcp_ssthresh(sk); + return cubictcp_recalc_ssthresh(sk); +} + +SEC("struct_ops/min_tso_segs") +u32 BPF_PROG(min_tso_segs, struct sock *sk) +{ + return bbr_min_tso_segs(sk); +} + +SEC("struct_ops/set_state") +void BPF_PROG(set_state, struct sock *sk, u8 new_state) +{ + bbr_set_state(sk, new_state); + dctcp_state(sk, new_state); + cubictcp_state(sk, new_state); +} + 
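+/* Note that every callback above simply forwards to the exported bbr/dctcp/
+ * cubictcp kfuncs; the program only verifies that these congestion-control
+ * kfuncs are callable from struct_ops context, it is not a usable CA.
+ */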
+SEC("struct_ops/pkts_acked") +void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +SEC(".struct_ops") +struct tcp_congestion_ops tcp_ca_kfunc = { + .init = (void *)init, + .in_ack_event = (void *)in_ack_event, + .cong_control = (void *)cong_control, + .cong_avoid = (void *)cong_avoid, + .sndbuf_expand = (void *)sndbuf_expand, + .undo_cwnd = (void *)undo_cwnd, + .cwnd_event = (void *)cwnd_event, + .ssthresh = (void *)ssthresh, + .min_tso_segs = (void *)min_tso_segs, + .set_state = (void *)set_state, + .pkts_acked = (void *)pkts_acked, + .name = "tcp_ca_kfunc", +}; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c index 0988d79f1587..42c729f85524 100644 --- a/tools/testing/selftests/bpf/progs/tcp_rtt.c +++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; struct { @@ -55,5 +58,8 @@ int _sockops(struct bpf_sock_ops *ctx) storage->delivered_ce = tcp_sk->delivered_ce; storage->icsk_retransmits = tcp_sk->icsk_retransmits; + storage->mrtt_us = ctx->args[0]; + storage->srtt = ctx->args[1]; + return 1; } diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_n.c b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c new file mode 100644 index 000000000000..8669eb42dbe0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Andrea Righi <andrea.righi@canonical.com> + +#include <linux/bpf.h> +#include <sched.h> +#include <unistd.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +#define TASK_COMM_LEN 16 + +struct sample { + int pid; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +int pid = 0; +long value = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_ringbuf_n(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + + if (cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) + return 0; + + sample->pid = pid; + sample->value = value; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + + bpf_ringbuf_submit(sample, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c index 45e8fc75a739..996b177324ba 100644 --- a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -24,8 +24,7 @@ struct { __type(value, __u64); } socket_storage SEC(".maps"); -SEC("sk_msg") -int prog_msg_verdict(struct sk_msg_md *msg) +static int prog_msg_verdict_common(struct sk_msg_md *msg) { struct task_struct *task = (struct task_struct *)bpf_get_current_task(); int verdict = SK_PASS; @@ -44,4 +43,28 @@ int prog_msg_verdict(struct sk_msg_md *msg) return verdict; } +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_msg") +int prog_msg_verdict_clone(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_msg") +int prog_msg_verdict_clone2(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_skb/stream_verdict") +int 
prog_skb_verdict(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 1d86a717a290..69aacc96db36 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -23,10 +23,25 @@ struct { __type(value, int); } sock_map_msg SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/stream_verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_PASS; } +int clone_called; + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_clone(struct __sk_buff *skb) +{ + clone_called = 1; + return SK_PASS; +} + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c index 3c69aa971738..d25b0bb30fc0 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c @@ -9,7 +9,7 @@ struct { __type(value, __u64); } sock_map SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_DROP; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 42ec202015ed..2619ed193c65 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Facebook - #include <linux/bpf.h> #include <asm/unistd.h> #include <bpf/bpf_helpers.h> @@ -26,79 +25,108 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } -SEC("tp/syscalls/sys_enter_getpgid") -int bench_trigger_tp(void *ctx) +SEC("?uprobe") +int bench_trigger_uprobe(void *ctx) { inc_counter(); return 0; } -SEC("raw_tp/sys_enter") -int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) +const volatile int batch_iters = 0; + +SEC("?raw_tp") +int trigger_count(void *ctx) { - if (id == __NR_getpgid) + int i; + + for (i = 0; i < batch_iters; i++) inc_counter(); + + return 0; +} + +SEC("?raw_tp") +int trigger_driver(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_get_numa_node_id(); /* attach point for benchmarking */ + return 0; } -SEC("kprobe/" SYS_PREFIX "sys_getpgid") +extern int bpf_modify_return_test_tp(int nonce) __ksym __weak; + +SEC("?raw_tp") +int trigger_driver_kfunc(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_modify_return_test_tp(0); /* attach point for benchmarking */ + + return 0; +} + +SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { inc_counter(); return 0; } -SEC("kretprobe/" SYS_PREFIX "sys_getpgid") +SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { inc_counter(); return 0; } -SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid") +SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { inc_counter(); return 0; } -SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid") +SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { inc_counter(); return 0; } -SEC("fentry/" SYS_PREFIX "sys_getpgid") +SEC("?fentry/bpf_get_numa_node_id") int 
bench_trigger_fentry(void *ctx) { inc_counter(); return 0; } -SEC("fexit/" SYS_PREFIX "sys_getpgid") +SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { inc_counter(); return 0; } -SEC("fentry.s/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fentry_sleep(void *ctx) +SEC("?fmod_ret/bpf_modify_return_test_tp") +int bench_trigger_fmodret(void *ctx) { inc_counter(); - return 0; + return -22; } -SEC("fmod_ret/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fmodret(void *ctx) +SEC("?tp/bpf_test_run/bpf_trigger_tp") +int bench_trigger_tp(void *ctx) { inc_counter(); - return -22; + return 0; } -SEC("uprobe") -int bench_trigger_uprobe(void *ctx) +SEC("?raw_tp/bpf_trigger_tp") +int bench_trigger_rawtp(void *ctx) { inc_counter(); return 0; diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 0ede0ccd090c..059aa716e3d0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -30,7 +30,7 @@ struct { SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_kprobe_1(void) { asm volatile (" \ @@ -44,7 +44,7 @@ __naked void in_bpf_prog_type_kprobe_1(void) SEC("tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_tracepoint_1(void) { asm volatile (" \ @@ -58,7 +58,7 @@ __naked void in_bpf_prog_type_tracepoint_1(void) SEC("perf_event") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_perf_event_1(void) { asm volatile (" \ @@ -72,7 +72,7 @@ __naked void bpf_prog_type_perf_event_1(void) SEC("raw_tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_raw_tracepoint_1(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c new file mode 100644 index 000000000000..cb32b0cfc84b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_helpers.h> + +#include "bpf_misc.h" +#include "cgrp_kfunc_common.h" +#include "cpumask_common.h" +#include "task_kfunc_common.h" + +char _license[] SEC("license") = "GPL"; + +/*************** + * Task kfuncs * + ***************/ + +static void task_kfunc_load_test(void) +{ + struct task_struct *current, *ref_1, *ref_2; + + current = bpf_get_current_task_btf(); + ref_1 = bpf_task_from_pid(current->pid); + if (!ref_1) + return; + + ref_2 = bpf_task_acquire(ref_1); + if (ref_2) + bpf_task_release(ref_2); + bpf_task_release(ref_1); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(task_kfunc_raw_tp) +{ + task_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(task_kfunc_syscall) +{ + task_kfunc_load_test(); + return 0; +} + +/***************** + * cgroup kfuncs * + *****************/ + +static void cgrp_kfunc_load_test(void) +{ + struct cgroup *cgrp, *ref; + + cgrp = bpf_cgroup_from_id(0); + if (!cgrp) + return; + + ref = bpf_cgroup_acquire(cgrp); + if (!ref) { + bpf_cgroup_release(cgrp); + return; + } + + bpf_cgroup_release(ref); + bpf_cgroup_release(cgrp); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cgrp_kfunc_raw_tp) +{ + cgrp_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cgrp_kfunc_syscall) +{ + cgrp_kfunc_load_test(); + return 0; +} + +/****************** + * cpumask kfuncs * + ******************/ + +static void cpumask_kfunc_load_test(void) +{ + struct bpf_cpumask *alloc, *ref; + + alloc = bpf_cpumask_create(); + if (!alloc) + return; + + ref = bpf_cpumask_acquire(alloc); + bpf_cpumask_set_cpu(0, alloc); + bpf_cpumask_test_cpu(0, (const struct cpumask *)ref); + + bpf_cpumask_release(ref); + bpf_cpumask_release(alloc); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cpumask_kfunc_raw_tp) +{ + cpumask_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cpumask_kfunc_syscall) +{ + cpumask_kfunc_load_test(); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 6f5d19665cf6..4a58e0398e72 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -6,6 +6,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" +#include <../../../tools/include/linux/filter.h> #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) @@ -76,6 +77,94 @@ __naked int subprog_result_precise(void) ); } +__naked __noinline __used +static unsigned long fp_leaking_subprog() +{ + asm volatile ( + ".8byte %[r0_eq_r10_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r10_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_10, 8)) + ); +} + +__naked __noinline __used +static unsigned long sneaky_fp_leaking_subprog() +{ + asm volatile ( + "r1 = r10;" + ".8byte %[r0_eq_r1_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r1_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_1, 8)) + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") 
+__msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") +__msg("7: R0_w=scalar") +__naked int fp_precise_subprog_result(void) +{ + asm volatile ( + "call fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 11: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") +/* here r1 is marked precise, even though it's fp register, but that's fine + * because by the time we get out of subprogram it has to be derived from r10 + * anyways, at which point we'll break precision chain + */ +__msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") +__msg("7: R0_w=scalar") +__naked int sneaky_fp_precise_subprog_result(void) +{ + asm volatile ( + "call sneaky_fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + SEC("?raw_tp") __success __log_level(2) __msg("9: (0f) r1 += r0") diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c new file mode 100644 index 000000000000..49e712acbf60 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct hmap_elem { + int counter; + struct bpf_timer timer; /* unused */ + struct bpf_spin_lock lock; /* unused */ + struct bpf_wq work; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap_malloc SEC(".maps"); + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +__u32 ok; +__u32 ok_sleepable; + +static int test_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) +{ + struct elem init = {}, *val; + struct bpf_wq *wq; + + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + + if (map == &lru && + bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = 
bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + wq = &val->w; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + + if (bpf_wq_start(wq, 0)) + return -5; + + return 0; +} + +static int test_hmap_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) +{ + struct hmap_elem init = {}, *val; + struct bpf_wq *wq; + + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + + if (bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + wq = &val->work; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + + if (bpf_wq_start(wq, 0)) + return -5; + + return 0; +} + +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + ok |= (1 << *key); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + ok_sleepable |= (1 << *key); + return 0; +} + +SEC("tc") +/* test that workqueues can be used from an array */ +__retval(0) +long test_call_array_sleepable(void *ctx) +{ + int key = 0; + + return test_elem_callback(&array, &key, wq_cb_sleepable); +} + +SEC("syscall") +/* Same test than above but from a sleepable context. */ +__retval(0) +long test_syscall_array_sleepable(void *ctx) +{ + int key = 1; + + return test_elem_callback(&array, &key, wq_cb_sleepable); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap */ +__retval(0) +long test_call_hash_sleepable(void *ctx) +{ + int key = 2; + + return test_hmap_elem_callback(&hmap, &key, wq_callback); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap with NO_PREALLOC. 
*/ +__retval(0) +long test_call_hash_malloc_sleepable(void *ctx) +{ + int key = 3; + + return test_hmap_elem_callback(&hmap_malloc, &key, wq_callback); +} + +SEC("tc") +/* test that workqueues can be used from a LRU map */ +__retval(0) +long test_call_lru_sleepable(void *ctx) +{ + int key = 4; + + return test_elem_callback(&lru, &key, wq_callback); +} diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c new file mode 100644 index 000000000000..4cbdb425f223 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + return 0; +} + +SEC("tc") +/* test that bpf_wq_init takes a map as a second argument + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("pointer in R2 isn't map pointer") +long test_wq_init_nomap(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &key, 0) != 0) + return -3; + + return 0; +} + +SEC("tc") +/* test that the workqueue is part of the map in bpf_wq_init + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("workqueue pointer in R1 map_uid=0 doesn't match map pointer in R2 map_uid=0") +long test_wq_init_wrong_map(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &lru, 0) != 0) + return -3; + + return 0; +} + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. + */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("arg#0 doesn't point to a map value") +long test_wrong_wq_pointer(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)&wq, wq_callback, 0)) + return 3; + + return -22; +} + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. 
+ */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") +long test_wrong_wq_pointer_offset(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)wq + 1, wq_cb_sleepable, 0)) + return 3; + + return -22; +} diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 80c42583f597..c412de84b88f 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -19,6 +19,7 @@ #include <bpf/libbpf.h> #include "cgroup_helpers.h" +#include "network_helpers.h" #include "testing_helpers.h" #include "bpf_util.h" @@ -604,44 +605,6 @@ static struct sock_addr_test tests[] = { }, }; -static int mk_sockaddr(int domain, const char *ip, unsigned short port, - struct sockaddr *addr, socklen_t addr_len) -{ - struct sockaddr_in6 *addr6; - struct sockaddr_in *addr4; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - return -1; - } - - memset(addr, 0, addr_len); - - if (domain == AF_INET) { - if (addr_len < sizeof(struct sockaddr_in)) - return -1; - addr4 = (struct sockaddr_in *)addr; - addr4->sin_family = domain; - addr4->sin_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) { - log_err("Invalid IPv4: %s", ip); - return -1; - } - } else if (domain == AF_INET6) { - if (addr_len < sizeof(struct sockaddr_in6)) - return -1; - addr6 = (struct sockaddr_in6 *)addr; - addr6->sin6_family = domain; - addr6->sin6_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) { - log_err("Invalid IPv6: %s", ip); - return -1; - } - } - - return 0; -} - static int load_insns(const struct sock_addr_test *test, const struct bpf_insn *insns, size_t insns_cnt) { @@ -756,9 +719,9 @@ static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) return -1; } - if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr *)&dst4_rw_addr, - sizeof(dst4_rw_addr)) == -1) + if (make_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + (struct sockaddr_storage *)&dst4_rw_addr, + NULL) == -1) return -1; struct bpf_insn insns[] = { @@ -819,9 +782,9 @@ static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, return -1; } - if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, - (struct sockaddr *)&dst6_rw_addr, - sizeof(dst6_rw_addr)) == -1) + if (make_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, + (struct sockaddr_storage *)&dst6_rw_addr, + NULL) == -1) return -1; struct bpf_insn insns[] = { @@ -939,69 +902,6 @@ static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); } -static int start_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int fd; - - fd = socket(addr->ss_family, type, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (bind(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (type == SOCK_STREAM) { - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - } - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int connect_to_server(int type, const struct 
sockaddr_storage *addr, - socklen_t addr_len) -{ - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Fail to connect to server"); - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - int init_pktinfo(int domain, struct cmsghdr *cmsg) { struct in6_pktinfo *pktinfo6; @@ -1146,19 +1046,17 @@ static int init_addrs(const struct sock_addr_test *test, struct sockaddr_storage *expected_addr, struct sockaddr_storage *expected_src_addr) { - socklen_t addr_len = sizeof(struct sockaddr_storage); - - if (mk_sockaddr(test->domain, test->expected_ip, test->expected_port, - (struct sockaddr *)expected_addr, addr_len) == -1) + if (make_sockaddr(test->domain, test->expected_ip, test->expected_port, + expected_addr, NULL) == -1) goto err; - if (mk_sockaddr(test->domain, test->requested_ip, test->requested_port, - (struct sockaddr *)requested_addr, addr_len) == -1) + if (make_sockaddr(test->domain, test->requested_ip, test->requested_port, + requested_addr, NULL) == -1) goto err; if (test->expected_src_ip && - mk_sockaddr(test->domain, test->expected_src_ip, 0, - (struct sockaddr *)expected_src_addr, addr_len) == -1) + make_sockaddr(test->domain, test->expected_src_ip, 0, + expected_src_addr, NULL) == -1) goto err; return 0; @@ -1178,7 +1076,7 @@ static int run_bind_test_case(const struct sock_addr_test *test) if (init_addrs(test, &requested_addr, &expected_addr, NULL)) goto err; - servfd = start_server(test->type, &requested_addr, addr_len); + servfd = start_server_addr(test->type, &requested_addr, addr_len, NULL); if (servfd == -1) goto err; @@ -1186,7 +1084,7 @@ static int run_bind_test_case(const struct sock_addr_test *test) goto err; /* Try to connect to server just in case */ - clientfd = connect_to_server(test->type, &expected_addr, addr_len); + clientfd = connect_to_addr(test->type, &expected_addr, addr_len, NULL); if (clientfd == -1) goto err; @@ -1214,11 +1112,11 @@ static int run_connect_test_case(const struct sock_addr_test *test) goto err; /* Prepare server to connect to */ - servfd = start_server(test->type, &expected_addr, addr_len); + servfd = start_server_addr(test->type, &expected_addr, addr_len, NULL); if (servfd == -1) goto err; - clientfd = connect_to_server(test->type, &requested_addr, addr_len); + clientfd = connect_to_addr(test->type, &requested_addr, addr_len, NULL); if (clientfd == -1) goto err; @@ -1271,7 +1169,7 @@ static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg) goto err; /* Prepare server to sendmsg to */ - servfd = start_server(test->type, &server_addr, addr_len); + servfd = start_server_addr(test->type, &server_addr, addr_len, NULL); if (servfd == -1) goto err; diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 024a0faafb3b..43612de44fbf 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -2104,9 +2104,9 @@ out: free(options.whitelist); if (options.blacklist) free(options.blacklist); + close(cg_fd); if (cg_created) cleanup_cgroup_environment(); - close(cg_fd); return err; } diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 
28b6646662af..d5379a0e6da8 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -368,9 +368,23 @@ int delete_module(const char *name, int flags) int unload_bpf_testmod(bool verbose) { + int ret, cnt = 0; + if (kern_sync_rcu()) fprintf(stdout, "Failed to trigger kernel-side RCU sync!\n"); - if (delete_module("bpf_testmod", 0)) { + + for (;;) { + ret = delete_module("bpf_testmod", 0); + if (!ret || errno != EAGAIN) + break; + if (++cnt > 10000) { + fprintf(stdout, "Unload of bpf_testmod timed out\n"); + break; + } + usleep(100); + } + + if (ret) { if (errno == ENOENT) { if (verbose) fprintf(stdout, "bpf_testmod.ko is already unloaded.\n"); diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 27fd7ed3e4b0..70e29f316fe7 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -61,12 +61,7 @@ void free_kallsyms_local(struct ksyms *ksyms) free(ksyms); } -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; -} - -struct ksyms *load_kallsyms_local(void) +static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb) { FILE *f; char func[256], buf[256]; @@ -100,7 +95,7 @@ struct ksyms *load_kallsyms_local(void) goto error; } fclose(f); - qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), ksym_cmp); + qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb); return ksyms; error: @@ -109,6 +104,21 @@ error: return NULL; } +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +struct ksyms *load_kallsyms_local(void) +{ + return load_kallsyms_local_common(ksym_cmp); +} + +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb) +{ + return load_kallsyms_local_common(cmp_cb); +} + int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); @@ -148,6 +158,28 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key) return &ksyms->syms[0]; } +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p, + ksym_search_cmp_t cmp_cb) +{ + int start = 0, mid, end = ksyms->sym_cnt; + struct ksym *ks; + int result; + + while (start < end) { + mid = start + (end - start) / 2; + ks = &ksyms->syms[mid]; + result = cmp_cb(p, ks); + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return ks; + } + + return NULL; +} + struct ksym *ksym_search(long key) { if (!ksyms) @@ -201,29 +233,6 @@ out: return err; } -void read_trace_pipe(void) -{ - int trace_fd; - - if (access(TRACEFS_PIPE, F_OK) == 0) - trace_fd = open(TRACEFS_PIPE, O_RDONLY, 0); - else - trace_fd = open(DEBUGFS_PIPE, O_RDONLY, 0); - if (trace_fd < 0) - return; - - while (1) { - static char buf[4096]; - ssize_t sz; - - sz = read(trace_fd, buf, sizeof(buf) - 1); - if (sz > 0) { - buf[sz] = 0; - puts(buf); - } - } -} - ssize_t get_uprobe_offset(const void *addr) { size_t start, end, base; @@ -381,3 +390,43 @@ out: close(fd); return err; } + +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), void *data, int iter) +{ + size_t buflen, n; + char *buf = NULL; + FILE *fp = NULL; + + if (access(TRACEFS_PIPE, F_OK) == 0) + fp = fopen(TRACEFS_PIPE, "r"); + else + fp = fopen(DEBUGFS_PIPE, "r"); + if (!fp) + return -1; + + /* We do not want to wait forever when iter is specified. 
*/ + if (iter) + fcntl(fileno(fp), F_SETFL, O_NONBLOCK); + + while ((n = getline(&buf, &buflen, fp) >= 0) || errno == EAGAIN) { + if (n > 0) + cb(buf, data); + if (iter && !(--iter)) + break; + } + + free(buf); + if (fp) + fclose(fp); + return 0; +} + +static void trace_pipe_cb(const char *str, void *data) +{ + printf("%s", str); +} + +void read_trace_pipe(void) +{ + read_trace_pipe_iter(trace_pipe_cb, NULL, 0); +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 04fd1da7079d..2ce873c9f9aa 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -13,6 +13,9 @@ struct ksym { }; struct ksyms; +typedef int (*ksym_cmp_t)(const void *p1, const void *p2); +typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2); + int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); @@ -22,10 +25,16 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key); long ksym_get_addr_local(struct ksyms *ksyms, const char *name); void free_kallsyms_local(struct ksyms *ksyms); +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb); +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1, + ksym_search_cmp_t cmp_cb); + /* open kallsyms and find addresses on the fly, faster than load + search. */ int kallsyms_find(const char *sym, unsigned long long *addr); void read_trace_pipe(void); +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), + void *data, int iter); ssize_t get_uprobe_offset(const void *addr); ssize_t get_rel_offset(uintptr_t addr); diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c index a61ceab60b68..7ffa563ffeba 100644 --- a/tools/testing/selftests/bpf/uprobe_multi.c +++ b/tools/testing/selftests/bpf/uprobe_multi.c @@ -9,7 +9,7 @@ #define NAME(name, idx) PASTE(name, idx) -#define DEF(name, idx) int NAME(name, idx)(void) { return 0; } +#define DEF(name, idx) int __attribute__((weak)) NAME(name, idx)(void) { return 0; } #define CALL(name, idx) NAME(name, idx)(); #define F(body, name, idx) body(name, idx) diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index bdf5d8180067..0859fe727da7 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -495,20 +495,6 @@ peek: return 0; } -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ - static int rxq_num(const char *ifname) { struct ethtool_channels ch = { diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index b1102ee13faa..2eac0895b0a1 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -81,6 +81,7 @@ #include <linux/mman.h> #include <linux/netdev.h> #include <linux/bitmap.h> +#include <linux/ethtool.h> #include <arpa/inet.h> #include <net/if.h> #include <locale.h> @@ -105,11 +106,15 @@ #include "../kselftest.h" #include "xsk_xdp_common.h" +#include <network_helpers.h> + static bool opt_verbose; static bool opt_print_tests; static enum test_mode opt_mode = TEST_MODE_ALL; static u32 opt_run_test = RUN_ALL_TESTS; +void test__fail(void) { /* for network_helpers.c */ } + static void 
__exit_with_error(int error, const char *file, const char *func, int line) { ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, @@ -239,7 +244,7 @@ static void enable_busy_poll(struct xsk_socket_info *xsk) (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); - sock_opt = BATCH_SIZE; + sock_opt = xsk->batch_size; if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); @@ -409,6 +414,33 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj } } +static int set_ring_size(struct ifobject *ifobj) +{ + int ret; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring); + if (!ret) + break; + + /* Retry if it fails */ + if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) + return -errno; + + usleep(USLEEP_MAX); + } + + return ret; +} + +static int hw_ring_size_reset(struct ifobject *ifobj) +{ + ifobj->ring.tx_pending = ifobj->set_ring.default_tx; + ifobj->ring.rx_pending = ifobj->set_ring.default_rx; + return set_ring_size(ifobj); +} + static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, struct ifobject *ifobj_rx) { @@ -439,6 +471,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, for (j = 0; j < MAX_SOCKETS; j++) { memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; if (i == 0) ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; else @@ -451,12 +484,16 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + test->ifobj_tx = ifobj_tx; test->ifobj_rx = ifobj_rx; test->current_step = 0; test->total_steps = 1; test->nb_sockets = 1; test->fail = false; + test->set_ring = false; test->mtu = MAX_ETH_PKT_SIZE; test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; @@ -1087,7 +1124,7 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) return TEST_CONTINUE; } - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); if (!rcvd) return TEST_CONTINUE; @@ -1239,7 +1276,8 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); /* pkts_in_flight might be negative if many invalid packets are sent */ - if (pkts_in_flight >= (int)((umem_size(umem) - BATCH_SIZE * buffer_len) / buffer_len)) { + if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / + buffer_len)) { ret = kick_tx(xsk); if (ret) return TEST_FAILURE; @@ -1249,7 +1287,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b fds.fd = xsk_socket__fd(xsk->xsk); fds.events = POLLOUT; - while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) { + while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { if (use_poll) { ret = poll(&fds, 1, POLL_TMOUT); if (timeout) { @@ -1269,10 +1307,10 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b } } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } - for (i = 0; i < BATCH_SIZE; i++) { + for (i = 0; i < xsk->batch_size; i++) { struct pkt *pkt = 
pkt_stream_get_next_tx_pkt(pkt_stream); u32 nb_frags_left, nb_frags, bytes_written = 0; @@ -1280,9 +1318,9 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b break; nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); - if (nb_frags > BATCH_SIZE - i) { + if (nb_frags > xsk->batch_size - i) { pkt_stream_cancel(pkt_stream); - xsk_ring_prod__cancel(&xsk->tx, BATCH_SIZE - i); + xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); break; } nb_frags_left = nb_frags; @@ -1370,7 +1408,7 @@ static int wait_for_tx_completion(struct xsk_socket_info *xsk) return TEST_FAILURE; } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } return TEST_PASS; @@ -1860,6 +1898,14 @@ static int testapp_validate_traffic(struct test_spec *test) return TEST_SKIP; } + if (test->set_ring) { + if (ifobj_tx->hw_ring_size_supp) + return set_ring_size(ifobj_tx); + + ksft_test_result_skip("Changing HW ring size not supported.\n"); + return TEST_SKIP; + } + xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx); return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); } @@ -2373,6 +2419,50 @@ static int testapp_xdp_metadata_mb(struct test_spec *test) return testapp_xdp_metadata_copy(test); } +static int testapp_hw_sw_min_ring_size(struct test_spec *test) +{ + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; + test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; + test->ifobj_tx->xsk->batch_size = 1; + test->ifobj_rx->xsk->batch_size = 1; + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch size to hw_ring_size - 1 */ + test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + return testapp_validate_traffic(test); +} + +static int testapp_hw_sw_max_ring_size(struct test_spec *test) +{ + u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; + test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; + test->ifobj_rx->umem->num_frames = max_descs; + test->ifobj_rx->xsk->rxqsize = max_descs; + test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch_size to 4095 */ + test->ifobj_tx->xsk->batch_size = max_descs - 1; + test->ifobj_rx->xsk->batch_size = max_descs - 1; + return testapp_validate_traffic(test); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2477,7 +2567,9 @@ static const struct test_spec tests[] = { {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, -}; + {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, + {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, + }; static void print_tests(void) { @@ -2497,6 +2589,7 @@ int main(int argc, char **argv) int modes = TEST_MODE_SKB + 1; struct test_spec test; bool shared_netdev; + int ret; /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); @@ -2534,6 +2627,13 @@ int main(int argc, char **argv) modes++; } + ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring); + if (!ret) { + 
ifobj_tx->hw_ring_size_supp = true; + ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending; + ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; + } + init_iface(ifobj_rx, worker_testapp_validate_rx); init_iface(ifobj_tx, worker_testapp_validate_tx); @@ -2581,6 +2681,9 @@ int main(int argc, char **argv) } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + pkt_stream_delete(tx_pkt_stream_default); pkt_stream_delete(rx_pkt_stream_default); xsk_unload_xdp_programs(ifobj_tx); diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index f174df2d693f..906de5fab7a3 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -44,7 +44,7 @@ #define MAX_ETH_JUMBO_SIZE 9000 #define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define BATCH_SIZE 64 +#define DEFAULT_BATCH_SIZE 64 #define POLL_TMOUT 1000 #define THREAD_TMOUT 3 #define DEFAULT_PKT_CNT (4 * 1024) @@ -91,6 +91,7 @@ struct xsk_socket_info { struct pkt_stream *pkt_stream; u32 outstanding_tx; u32 rxqsize; + u32 batch_size; u8 dst_mac[ETH_ALEN]; u8 src_mac[ETH_ALEN]; }; @@ -113,6 +114,11 @@ struct pkt_stream { bool verbatim; }; +struct set_hw_ring { + u32 default_tx; + u32 default_rx; +}; + struct ifobject; struct test_spec; typedef int (*validation_func_t)(struct ifobject *ifobj); @@ -129,6 +135,8 @@ struct ifobject { struct xsk_xdp_progs *xdp_progs; struct bpf_map *xskmap; struct bpf_program *xdp_prog; + struct ethtool_ringparam ring; + struct set_hw_ring set_ring; enum test_mode mode; int ifindex; int mtu; @@ -145,6 +153,7 @@ struct ifobject { bool unaligned_supp; bool multi_buff_supp; bool multi_buff_zc_supp; + bool hw_ring_size_supp; }; struct test_spec { @@ -162,6 +171,7 @@ struct test_spec { u16 current_step; u16 nb_sockets; bool fail; + bool set_ring; enum test_mode mode; char name[MAX_TEST_NAME_SIZE]; }; |
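The xskxceiver changes above wire the new HW_SW_MIN_RING_SIZE and HW_SW_MAX_RING_SIZE tests up to get_hw_ring_size()/set_hw_ring_size(), which are pulled in through network_helpers.h and are not shown in this diff. They presumably wrap the classic ethtool ioctl; the sketch below, a minimal stand-alone example and not a copy of those helpers, shows how NIC descriptor ring sizes can be read and reprogrammed with the stable ETHTOOL_GRINGPARAM/ETHTOOL_SRINGPARAM UAPI that struct ethtool_ringparam (now visible via the added linux/ethtool.h include) belongs to:

/* Stand-alone sketch: query and set NIC ring sizes via SIOCETHTOOL.
 * This illustrates the underlying UAPI, not the selftest helpers themselves.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

static int ethtool_ring_ioctl(const char *ifname, struct ethtool_ringparam *ring)
{
	struct ifreq ifr = {};
	int fd, err;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -errno;

	snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", ifname);
	ifr.ifr_data = (void *)ring;

	err = ioctl(fd, SIOCETHTOOL, &ifr);
	close(fd);
	return err ? -errno : 0;
}

int main(int argc, char **argv)
{
	struct ethtool_ringparam ring = { .cmd = ETHTOOL_GRINGPARAM };
	const char *ifname = argc > 1 ? argv[1] : "eth0";

	if (ethtool_ring_ioctl(ifname, &ring)) {
		perror("ETHTOOL_GRINGPARAM");
		return 1;
	}
	printf("%s: rx %u/%u tx %u/%u\n", ifname,
	       ring.rx_pending, ring.rx_max_pending,
	       ring.tx_pending, ring.tx_max_pending);

	/* Shrink both rings to 64 descriptors; the HW_SW_MIN_RING_SIZE test
	 * similarly sets tx_pending to DEFAULT_BATCH_SIZE (64) before driving
	 * a batch size of ring size - 1.
	 */
	ring.cmd = ETHTOOL_SRINGPARAM;
	ring.rx_pending = 64;
	ring.tx_pending = 64;
	if (ethtool_ring_ioctl(ifname, &ring)) {
		perror("ETHTOOL_SRINGPARAM");
		return 1;
	}
	return 0;
}

As with the helpers used in the diff, a real caller would retry on EBUSY (the SOCK_RECONF_CTR loop in set_ring_size()) and restore the original rx_pending/tx_pending values afterwards, which is what hw_ring_size_reset() does with the defaults captured at startup.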