Diffstat (limited to 'samples'): 26 files changed, 2066 insertions(+), 269 deletions(-)
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 9b4a66e3363e..6a9321ec348a 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -40,6 +40,7 @@ hostprogs-y += per_socket_stats_example hostprogs-y += load_sock_ops hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map +hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor hostprogs-y += syscall_tp @@ -85,6 +86,7 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o +xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o @@ -128,8 +130,10 @@ always += tcp_bufs_kern.o always += tcp_cong_kern.o always += tcp_iw_kern.o always += tcp_clamp_kern.o +always += tcp_basertt_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o +always += xdp_redirect_cpu_kern.o always += xdp_monitor_kern.o always += syscall_tp_kern.o @@ -170,6 +174,7 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf HOSTLOADLIBES_test_map_in_map += -lelf HOSTLOADLIBES_xdp_redirect += -lelf HOSTLOADLIBES_xdp_redirect_map += -lelf +HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf HOSTLOADLIBES_syscall_tp += -lelf @@ -178,6 +183,12 @@ HOSTLOADLIBES_syscall_tp += -lelf LLC ?= llc CLANG ?= clang +# Detect that we're cross compiling and use the cross compiler +ifdef CROSS_COMPILE +HOSTCC = $(CROSS_COMPILE)gcc +CLANG_ARCH_ARGS = -target $(ARCH) +endif + # Trick to allow make to be run from this directory all: $(MAKE) -C ../../ $(CURDIR)/ @@ -225,9 +236,9 @@ $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/%.o: $(src)/%.c $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \ -I$(srctree)/tools/testing/selftests/bpf/ \ - -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ - -Wno-compare-distinct-pointer-types \ + -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \ + -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ - -Wno-unknown-warning-option \ + -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index 79f9a58f1872..5f27e4faca50 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -64,3 +64,13 @@ It is also possible to point make to the newly compiled 'llc' or 'clang' command via redefining LLC or CLANG on the make command line:: make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang + +Cross compiling samples +----------------------- +In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH +environment variables before calling make. This will direct make to build +samples for the cross target. 
+ +export ARCH=arm64 +export CROSS_COMPILE="aarch64-linux-gnu-" +make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 2325d7ad76df..522ca9252d6c 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -222,6 +222,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, + maps[i].name, maps[i].def.key_size, inner_map_fd, maps[i].def.max_entries, @@ -229,6 +230,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, numa_node); } else { map_fd[i] = bpf_create_map_node(maps[i].def.type, + maps[i].name, maps[i].def.key_size, maps[i].def.value_size, maps[i].def.max_entries, diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c index 09afaddfc9ba..f3bca3ade0f3 100644 --- a/samples/bpf/cgroup_helpers.c +++ b/samples/bpf/cgroup_helpers.c @@ -57,7 +57,7 @@ int setup_cgroup_environment(void) return 1; } - if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) { + if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL) && errno != EBUSY) { log_err("mount cgroup2"); return 1; } @@ -164,7 +164,7 @@ int create_and_get_cgroup(char *path) format_cgroup_path(cgroup_path, path); if (mkdir(cgroup_path, 0777) && errno != EEXIST) { - log_err("mkdiring cgroup"); + log_err("mkdiring cgroup %s .. %s", path, cgroup_path); return 0; } diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 098c857f1eda..2b2ffb97018b 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -266,7 +266,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx) return 0; } -SEC("kprobe/sys_getpgrp") +SEC("kprobe/sys_getppid") int stress_array_map_lookup(struct pt_regs *ctx) { u32 key = 1, i; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index f388254896f6..519d9af4b04a 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -137,6 +137,7 @@ static void do_test_lru(enum test_type test, int cpu) inner_lru_map_fds[cpu] = bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, + test_map_names[INNER_LRU_HASH_PREALLOC], sizeof(uint32_t), sizeof(long), inner_lru_hash_size, 0, @@ -282,7 +283,7 @@ static void test_array_lookup(int cpu) start_time = time_get_ns(); for (i = 0; i < max_cnt; i++) - syscall(__NR_getpgrp, 0); + syscall(__NR_getppid, 0); printf("%d:array_lookup %lld lookups per sec\n", cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); } diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c index a3cb91ebf4e7..9169d3207f18 100644 --- a/samples/bpf/syscall_tp_user.c +++ b/samples/bpf/syscall_tp_user.c @@ -23,6 +23,13 @@ * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. 
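 * (If CONFIG_FTRACE_SYSCALLS is not enabled, the syscalls:sys_enter_*/
 * sys_exit_* tracepoint events are never created, so attaching these
 * programs will fail.)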
*/ +static void usage(const char *cmd) +{ + printf("USAGE: %s [-i num_progs] [-h]\n", cmd); + printf(" -i num_progs # number of progs of the test\n"); + printf(" -h # help\n"); +} + static void verify_map(int map_id) { __u32 key = 0; @@ -32,22 +39,29 @@ static void verify_map(int map_id) fprintf(stderr, "map_lookup failed: %s\n", strerror(errno)); return; } - if (val == 0) + if (val == 0) { fprintf(stderr, "failed: map #%d returns value 0\n", map_id); + return; + } + val = 0; + if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { + fprintf(stderr, "map_update failed: %s\n", strerror(errno)); + return; + } } -int main(int argc, char **argv) +static int test(char *filename, int num_progs) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - char filename[256]; - int fd; + int i, fd, map0_fds[num_progs], map1_fds[num_progs]; - setrlimit(RLIMIT_MEMLOCK, &r); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (load_bpf_file(filename)) { - fprintf(stderr, "%s", bpf_log_buf); - return 1; + for (i = 0; i < num_progs; i++) { + if (load_bpf_file(filename)) { + fprintf(stderr, "%s", bpf_log_buf); + return 1; + } + printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]); + map0_fds[i] = map_fd[0]; + map1_fds[i] = map_fd[1]; } /* current load_bpf_file has perf_event_open default pid = -1 @@ -64,8 +78,34 @@ int main(int argc, char **argv) close(fd); /* verify the map */ - verify_map(map_fd[0]); - verify_map(map_fd[1]); + for (i = 0; i < num_progs; i++) { + verify_map(map0_fds[i]); + verify_map(map1_fds[i]); + } return 0; } + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int opt, num_progs = 1; + char filename[256]; + + while ((opt = getopt(argc, argv, "i:h")) != -1) { + switch (opt) { + case 'i': + num_progs = atoi(optarg); + break; + case 'h': + default: + usage(argv[0]); + return 0; + } + } + + setrlimit(RLIMIT_MEMLOCK, &r); + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + return test(filename, num_progs); +} diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c new file mode 100644 index 000000000000..4bf4fc597db9 --- /dev/null +++ b/samples/bpf/tcp_basertt_kern.c @@ -0,0 +1,78 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set base_rtt to 80us when host is running TCP-NV and + * both hosts are in the same datacenter (as determined by IPv6 prefix). + * + * Use load_sock_ops to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char cong[20]; + char nv[] = "nv"; + int rv = 0, n; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check if both hosts are in the same datacenter. For this + * example they are if the 1st 5.5 bytes in the IPv6 address + * are the same. 
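 * (Here "5.5 bytes" means a 44-bit prefix: all 32 bits of local_ip6[0]
 * plus the top 12 bits of local_ip6[1], which is why the second words are
 * compared after bpf_ntohl() under a 0xfff00000 mask.)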
+ */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_BASE_RTT: + n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) { + /* Set base_rtt to 80us */ + rv = 80; + } else if (n) { + rv = n; + } else { + rv = -1; + } + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_bbf.readme b/samples/bpf/tcp_bbf.readme new file mode 100644 index 000000000000..831fb601e3c9 --- /dev/null +++ b/samples/bpf/tcp_bbf.readme @@ -0,0 +1,26 @@ +This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops) +programs. These programs attach to a cgroupv2. The following commands create +a cgroupv2 and attach a bash shell to the group. + + mkdir -p /tmp/cgroupv2 + mount -t cgroup2 none /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2/foo + bash + echo $$ >> /tmp/cgroupv2/foo/cgroup.procs + +Anything that runs under this shell belongs to the foo cgroupv2 To load +(attach) one of the tcp_*_kern.o programs: + + ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o + +If the "-l" flag is used, the load_sock_ops program will continue to run +printing the BPF log buffer. The tcp_*_kern.o programs use special print +functions to print logging information (if enabled by the ifdef). + +If using netperf/netserver to create traffic, you need to run them under the +cgroupv2 to which the BPF programs are attached (i.e. under bash shell +attached to the cgroupv2). + +To remove (unattach) a socket_ops BPF program from a cgroupv2: + + ./load_sock_ops -r /tmp/cgroupv2/foo diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 3049b1f26267..3e8232cc04a8 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -30,7 +30,7 @@ #define FOO "/foo" #define BAR "/foo/bar/" -#define PING_CMD "ping -c1 -w1 127.0.0.1" +#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null" char bpf_log_buf[BPF_LOG_BUF_SIZE]; @@ -55,8 +55,7 @@ static int prog_load(int verdict) return ret; } - -int main(int argc, char **argv) +static int test_foo_bar(void) { int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0; @@ -189,8 +188,223 @@ out: close(bar); cleanup_cgroup_environment(); if (!rc) - printf("PASS\n"); + printf("### override:PASS\n"); else - printf("FAIL\n"); + printf("### override:FAIL\n"); return rc; } + +static int map_fd = -1; + +static int prog_load_cnt(int verdict, int val) +{ + if (map_fd < 0) + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + struct bpf_insn prog[] = { + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / 
sizeof(struct bpf_insn); + int ret; + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); + + if (ret < 0) { + log_err("Loading program"); + printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); + return 0; + } + return ret; +} + + +static int test_multiprog(void) +{ + __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; + int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; + int drop_prog, allow_prog[6] = {}, rc = 0; + unsigned long long value; + int i = 0; + + for (i = 0; i < 6; i++) { + allow_prog[i] = prog_load_cnt(1, 1 << i); + if (!allow_prog[i]) + goto err; + } + drop_prog = prog_load_cnt(0, 1); + if (!drop_prog) + goto err; + + if (setup_cgroup_environment()) + goto err; + + cg1 = create_and_get_cgroup("/cg1"); + if (!cg1) + goto err; + cg2 = create_and_get_cgroup("/cg1/cg2"); + if (!cg2) + goto err; + cg3 = create_and_get_cgroup("/cg1/cg2/cg3"); + if (!cg3) + goto err; + cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); + if (!cg4) + goto err; + cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); + if (!cg5) + goto err; + + if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) + goto err; + + if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog to cg1"); + goto err; + } + if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Unexpected success attaching the same prog to cg1"); + goto err; + } + if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog2 to cg1"); + goto err; + } + if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to cg2"); + goto err; + } + if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog to cg3"); + goto err; + } + if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to cg4"); + goto err; + } + if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching prog to cg5"); + goto err; + } + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 8 + 32); + + /* query the number of effective progs in cg5 */ + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + NULL, NULL, &prog_cnt) == 0); + assert(prog_cnt == 4); + /* retrieve prog_ids of effective progs in cg5 */ + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 4); + assert(attach_flags == 0); + saved_prog_id = prog_ids[0]; + /* check enospc handling */ + prog_ids[0] = 0; + prog_cnt = 2; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == -1 && + errno == ENOSPC); + assert(prog_cnt == 4); + /* check that prog_ids are returned even when buffer is too small */ + assert(prog_ids[0] == saved_prog_id); + /* retrieve prog_id of single attached prog in cg5 */ + prog_ids[0] = 0; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, + NULL, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 1); + assert(prog_ids[0] == saved_prog_id); + + /* detach bottom program and ping again */ + if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching prog from cg5"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 
0); + assert(value == 1 + 2 + 8 + 16); + + /* detach 3rd from bottom program and ping again */ + errno = 0; + if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) { + log_err("Unexpected success on detach from cg3"); + goto err; + } + if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching from cg3"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 16); + + /* detach 2nd from bottom program and ping again */ + if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching prog from cg4"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 4); + + prog_cnt = 4; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 3); + assert(attach_flags == 0); + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, + NULL, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 0); + goto out; +err: + rc = 1; + +out: + for (i = 0; i < 6; i++) + if (allow_prog[i] > 0) + close(allow_prog[i]); + close(cg1); + close(cg2); + close(cg3); + close(cg4); + close(cg5); + cleanup_cgroup_environment(); + if (!rc) + printf("### multi:PASS\n"); + else + printf("### multi:FAIL\n"); + return rc; +} + +int main(int argc, char **argv) +{ + int rc = 0; + + rc = test_foo_bar(); + if (rc) + return rc; + + return test_multiprog(); +} diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index 41b6115a32eb..a77a583d94d4 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = { SEC("perf_event") int bpf_prog1(struct bpf_perf_event_data *ctx) { + char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; + char time_fmt2[] = "Get Time Failed, ErrCode: %d"; char fmt[] = "CPU-%d period %lld ip %llx"; u32 cpu = bpf_get_smp_processor_id(); + struct bpf_perf_event_value value_buf; struct key_t key; u64 *val, one = 1; + int ret; if (ctx->sample_period < 10000) /* ignore warmup */ @@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) return 0; } + ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value)); + if (!ret) + bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running); + else + bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); + val = bpf_map_lookup_elem(&counts, &key); if (val) (*val)++; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index 7bd827b84a67..bf4f1b6d9a52 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr) int *pmu_fd = malloc(nr_cpus * sizeof(int)); int i, error = 0; + /* system wide perf event, no need to inherit */ + attr->inherit = 0; + /* open perf_event on all cpus */ for (i = 0; i < nr_cpus; i++) { pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0); @@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr) { int pmu_fd; + /* per task perf event, enable inherit so the "dd ..." command can be traced properly. + * Enabling inherit will cause bpf_perf_prog_read_time helper failure. 
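 * (This presumably refers to the value-reading helper used in
 * trace_event_kern.c, bpf_perf_prog_read_value(); inherited events cannot
 * be read locally from BPF, which is why the system-wide test above
 * clears attr->inherit instead.)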
+ */ + attr->inherit = 1; + /* open task bound event */ pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0); if (pmu_fd < 0) { @@ -175,14 +183,12 @@ static void test_bpf_perf_event(void) .freq = 1, .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .inherit = 1, }; struct perf_event_attr attr_type_sw = { .sample_freq = SAMPLE_FREQ, .freq = 1, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, - .inherit = 1, }; struct perf_event_attr attr_hw_cache_l1d = { .sample_freq = SAMPLE_FREQ, @@ -192,7 +198,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), - .inherit = 1, }; struct perf_event_attr attr_hw_cache_branch_miss = { .sample_freq = SAMPLE_FREQ, @@ -202,7 +207,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_BPU | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), - .inherit = 1, }; struct perf_event_attr attr_type_raw = { .sample_freq = SAMPLE_FREQ, @@ -210,7 +214,6 @@ static void test_bpf_perf_event(void) .type = PERF_TYPE_RAW, /* Intel Instruction Retired */ .config = 0xc0, - .inherit = 1, }; printf("Test HW_CPU_CYCLES\n"); diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index e7d180305974..46c557afac73 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = { .value_size = sizeof(u64), .max_entries = 64, }; +struct bpf_map_def SEC("maps") values2 = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(int), + .value_size = sizeof(struct bpf_perf_event_value), + .max_entries = 64, +}; SEC("kprobe/htab_map_get_next_key") int bpf_prog1(struct pt_regs *ctx) @@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } +SEC("kprobe/htab_map_lookup_elem") +int bpf_prog2(struct pt_regs *ctx) +{ + u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *val, buf; + int error; + + error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf)); + if (error) + return 0; + + val = bpf_map_lookup_elem(&values2, &key); + if (val) + *val = buf; + else + bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST); + + return 0; +} + char _license[] SEC("license") = "GPL"; u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index a8c22dcf8e4e..89ab8d408474 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -23,6 +23,7 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) { + struct bpf_perf_event_value value2; int pmu_fd, error = 0; cpu_set_t set; __u64 value; @@ -47,8 +48,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) fprintf(stderr, "Value missing for CPU %d\n", cpu); error = 1; goto on_exit; + } else { + fprintf(stderr, "CPU %d: %llu\n", cpu, value); + } + /* The above bpf_map_lookup_elem should trigger the second kprobe */ + if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) { + fprintf(stderr, "Value2 missing for CPU %d\n", cpu); + error = 1; + goto on_exit; + } else { + fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu, + value2.counter, value2.enabled, value2.running); } - fprintf(stderr, "CPU %d: %llu\n", cpu, value); on_exit: assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error); diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 2431c0321b71..fdaefe91801d 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -14,6 +14,7 @@ #include 
<string.h> #include <unistd.h> #include <libgen.h> +#include <sys/resource.h> #include "bpf_load.h" #include "bpf_util.h" @@ -69,6 +70,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int opt; @@ -91,6 +93,12 @@ int main(int argc, char **argv) usage(basename(argv[0])); return 1; } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex = strtoul(argv[optind], NULL, 0); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 74f3fd8ed729..2fe2f761a0d0 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -13,23 +13,27 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = { /* TODO: have entries for all possible errno's */ }; +#define XDP_UNKNOWN XDP_REDIRECT + 1 +struct bpf_map_def SEC("maps") exception_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = XDP_UNKNOWN + 1, +}; + /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format * Code in: kernel/include/trace/events/xdp.h */ struct xdp_redirect_ctx { - unsigned short common_type; // offset:0; size:2; signed:0; - unsigned char common_flags; // offset:2; size:1; signed:0; - unsigned char common_preempt_count;// offset:3; size:1; signed:0; - int common_pid; // offset:4; size:4; signed:1; - - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12 size:4; signed:0; - int ifindex; // offset:16 size:4; signed:1; - int err; // offset:20 size:4; signed:1; - int to_ifindex; // offset:24 size:4; signed:1; - u32 map_id; // offset:28 size:4; signed:0; - int map_index; // offset:32 size:4; signed:1; -}; // offset:36 + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 enum { XDP_REDIRECT_SUCCESS = 0, @@ -48,7 +52,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); if (!cnt) - return 0; + return 1; *cnt += 1; return 0; /* Indicate event was filtered (no further processing)*/ @@ -86,3 +90,31 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) { return xdp_redirect_collect_stat(ctx); } + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + u64 *cnt;; + u32 key; + + key = ctx->act; + if (key > XDP_REDIRECT) + key = XDP_UNKNOWN; + + cnt = bpf_map_lookup_elem(&exception_cnt, &key); + if (!cnt) + return 1; + *cnt += 1; + + return 0; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index b51b4f5e3257..eaba165b3549 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -20,6 +20,7 @@ static const char 
*__doc_err_only__= #include <unistd.h> #include <locale.h> +#include <sys/resource.h> #include <getopt.h> #include <net/if.h> #include <time.h> @@ -61,7 +62,7 @@ static void usage(char *argv[]) } #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ -__u64 gettime(void) +static __u64 gettime(void) { struct timespec t; int res; @@ -89,6 +90,23 @@ static const char *err2str(int err) return redir_names[err]; return NULL; } +/* enum xdp_action */ +#define XDP_UNKNOWN XDP_REDIRECT + 1 +#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", + [XDP_UNKNOWN] = "XDP_UNKNOWN", +}; +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} struct record { __u64 counter; @@ -97,6 +115,7 @@ struct record { struct stats_record { struct record xdp_redir[REDIR_RES_MAX]; + struct record xdp_exception[XDP_ACTION_MAX]; }; static void stats_print_headers(bool err_only) @@ -104,39 +123,72 @@ static void stats_print_headers(bool err_only) if (err_only) printf("\n%s\n", __doc_err_only__); - printf("%-14s %-10s %-18s %-9s\n", - "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period"); + printf("%-14s %-11s %-10s %-18s %-9s\n", + "ACTION", "result", "pps ", "pps-human-readable", "measure-period"); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct record *r, struct record *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->counter - p->counter; + pps = packets / period; + } + return pps; } static void stats_print(struct stats_record *rec, struct stats_record *prev, bool err_only) { + double period = 0, pps = 0; + struct record *r, *p; int i = 0; + char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n"; + + /* tracepoint: xdp:xdp_redirect_* */ if (err_only) i = REDIR_ERROR; for (; i < REDIR_RES_MAX; i++) { - struct record *r = &rec->xdp_redir[i]; - struct record *p = &prev->xdp_redir[i]; - __u64 period = 0; - __u64 packets = 0; - double pps = 0; - double period_ = 0; + r = &rec->xdp_redir[i]; + p = &prev->xdp_redir[i]; if (p->timestamp) { - packets = r->counter - p->counter; - period = r->timestamp - p->timestamp; - if (period > 0) { - period_ = ((double) period / NANOSEC_PER_SEC); - pps = packets / period_; - } + period = calc_period(r, p); + pps = calc_pps(r, p, period); } + printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period); + } - printf("%-14s %-10.0f %'-18.0f %f\n", - err2str(i), pps, pps, period_); + /* tracepoint: xdp:xdp_exception */ + for (i = 0; i < XDP_ACTION_MAX; i++) { + r = &rec->xdp_exception[i]; + p = &prev->xdp_exception[i]; + if (p->timestamp) { + period = calc_period(r, p); + pps = calc_pps(r, p, period); + } + if (pps > 0) + printf(fmt, action2str(i), "Exception", + pps, pps, period); } + printf("\n"); } static __u64 get_key32_value64_percpu(int fd, __u32 key) @@ -160,25 +212,33 @@ static __u64 get_key32_value64_percpu(int fd, __u32 key) return sum; } -static bool stats_collect(int fd, struct stats_record *rec) +static bool stats_collect(struct stats_record *rec) { + int fd; int i; /* TODO: Detect if someone unloaded the perf event_fd's, as * this can happen by someone 
running perf-record -e */ + fd = map_data[0].fd; /* map0: redirect_err_cnt */ for (i = 0; i < REDIR_RES_MAX; i++) { rec->xdp_redir[i].timestamp = gettime(); rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); } + + fd = map_data[1].fd; /* map1: exception_cnt */ + for (i = 0; i < XDP_ACTION_MAX; i++) { + rec->xdp_exception[i].timestamp = gettime(); + rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i); + } + return true; } static void stats_poll(int interval, bool err_only) { struct stats_record rec, prev; - int map_fd; memset(&rec, 0, sizeof(rec)); @@ -190,23 +250,24 @@ static void stats_poll(int interval, bool err_only) printf("\n%s", __doc__); /* TODO Need more advanced stats on error types */ - if (verbose) - printf(" - Stats map: %s\n", map_data[0].name); - map_fd = map_data[0].fd; - - stats_print_headers(err_only); + if (verbose) { + printf(" - Stats map0: %s\n", map_data[0].name); + printf(" - Stats map1: %s\n", map_data[1].name); + printf("\n"); + } fflush(stdout); while (1) { memcpy(&prev, &rec, sizeof(rec)); - stats_collect(map_fd, &rec); + stats_collect(&rec); + stats_print_headers(err_only); stats_print(&rec, &prev, err_only); fflush(stdout); sleep(interval); } } -void print_bpf_prog_info(void) +static void print_bpf_prog_info(void) { int i; @@ -235,6 +296,7 @@ void print_bpf_prog_info(void) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int longindex = 0, opt; int ret = EXIT_SUCCESS; char bpf_obj_file[256]; @@ -265,13 +327,18 @@ int main(int argc, char **argv) } } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return EXIT_FAILURE; + } + if (load_bpf_file(bpf_obj_file)) { printf("ERROR - bpf_log_buf: %s", bpf_log_buf); - return 1; + return EXIT_FAILURE; } if (!prog_fd[0]) { printf("ERROR - load_bpf_file: %s\n", strerror(errno)); - return 1; + return EXIT_FAILURE; } if (debug) { diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c new file mode 100644 index 000000000000..303e9e7161f3 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_kern.c @@ -0,0 +1,609 @@ +/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP) + * + * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. + */ +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/if_vlan.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/udp.h> + +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +#define MAX_CPUS 12 /* WARNING - sync with _user.c */ + +/* Special map type that can XDP_REDIRECT frames to another CPU */ +struct bpf_map_def SEC("maps") cpu_map = { + .type = BPF_MAP_TYPE_CPUMAP, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = MAX_CPUS, +}; + +/* Common stats data record to keep userspace more simple */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 issue; +}; + +/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success + * feedback. Redirect TX errors can be caught via a tracepoint. 
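 * (Using a BPF_MAP_TYPE_PERCPU_ARRAY lets each CPU bump its own counters
 * without atomic operations; the _user.c side reads and sums the per-CPU
 * values when printing statistics.)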
+ */ +struct bpf_map_def SEC("maps") rx_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") redirect_err_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 2, + /* TODO: have entries for all possible errno's */ +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_CPUS, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Set of maps controlling available CPU, and for iterating through + * selectable redirect CPUs. + */ +struct bpf_map_def SEC("maps") cpus_available = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = MAX_CPUS, +}; +struct bpf_map_def SEC("maps") cpus_count = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; +struct bpf_map_def SEC("maps") cpus_iterator = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") exception_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Helper parse functions */ + +/* Parse Ethernet layer 2, extract network layer 3 offset and protocol + * + * Returns false on error and non-supported ether-type + */ +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +static __always_inline +bool parse_eth(struct ethhdr *eth, void *data_end, + u16 *eth_proto, u64 *l3_offset) +{ + u16 eth_type; + u64 offset; + + offset = sizeof(*eth); + if ((void *)eth + offset > data_end) + return false; + + eth_type = eth->h_proto; + + /* Skip non 802.3 Ethertypes */ + if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN)) + return false; + + /* Handle VLAN tagged packet */ + if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { + struct vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + offset += sizeof(*vlan_hdr); + if ((void *)eth + offset > data_end) + return false; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + } + /* TODO: Handle double VLAN tagged packet */ + + *eth_proto = ntohs(eth_type); + *l3_offset = offset; + return true; +} + +static __always_inline +u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + struct udphdr *udph; + u16 dport; + + if (iph + 1 > data_end) + return 0; + if (!(iph->protocol == IPPROTO_UDP)) + return 0; + + udph = (void *)(iph + 1); + if (udph + 1 > data_end) + return 0; + + dport = ntohs(udph->dest); + return dport; +} + +static __always_inline +int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static __always_inline +int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end 
= (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp_cpu_map0") +int xdp_prognum0_no_touch(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest; + u32 key = 0; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map1_touch_data") +int xdp_prognum1_touch_data(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest; + u16 eth_type; + u32 key = 0; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Validate packet length is minimum Eth header size */ + if (eth + 1 > data_end) + return XDP_ABORTED; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + /* Read packet data, and use it (drop non 802.3 Ethertypes) */ + eth_type = eth->h_proto; + if (ntohs(eth_type) < ETH_P_802_3_MIN) { + rec->dropped++; + return XDP_DROP; + } + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map2_round_robin") +int xdp_prognum2_round_robin(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + struct datarec *rec; + u32 cpu_dest; + u32 *cpu_lookup; + u32 key0 = 0; + + u32 *cpu_selected; + u32 *cpu_iterator; + u32 *cpu_max; + u32 cpu_idx; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); + if (!cpu_max) + return XDP_ABORTED; + + cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0); + if (!cpu_iterator) + return XDP_ABORTED; + cpu_idx = *cpu_iterator; + + *cpu_iterator += 1; + if (*cpu_iterator == *cpu_max) + *cpu_iterator = 0; + + cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key0); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map3_proto_separate") +int xdp_prognum3_proto_separate(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 
protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map4_ddos_filter_pktgen") +int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u16 dest_port; + u32 *cpu_lookup; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + /* DDoS filter UDP port 9 (pktgen) */ + dest_port = get_dest_port_ipv4_udp(ctx, l3_offset); + if (dest_port == 9) { + if (rec) + rec->dropped++; + return XDP_DROP; + } + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + + +char _license[] SEC("license") = "GPL"; + +/*** Trace point code ***/ + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_redirect_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 + +enum { + XDP_REDIRECT_SUCCESS = 0, + XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline +int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) +{ + u32 key = XDP_REDIRECT_ERROR; + struct datarec *rec; + int err = ctx->err; + + if (!err) + key = XDP_REDIRECT_SUCCESS; + + rec = bpf_map_lookup_elem(&redirect_err_cnt, &key); + if (!rec) + return 0; + rec->dropped += 1; + + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. 
a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in-practice as stopping perf-record also unload this + * bpf_prog. Plus, there is additional overhead of doing so. + */ +} + +SEC("tracepoint/xdp/xdp_redirect_err") +int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +SEC("tracepoint/xdp/xdp_redirect_map_err") +int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&exception_cnt, &key); + if (!rec) + return 1; + rec->dropped += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->issue += 1; + + /* Inception: It's possible to detect overload situations, via + * this tracepoint. This can be used for creating a feedback + * loop to XDP, which can take appropriate actions to mitigate + * this overload situation. 
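 * (For instance, a monitoring daemon could watch the drops counter here
 * and rewrite the cpus_available map to steer traffic away from an
 * overloaded CPU; such a feedback loop is not implemented in this sample.)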
+ */ + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->issue++; + + return 0; +} diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c new file mode 100644 index 000000000000..35fec9fecb57 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -0,0 +1,697 @@ +/* GPLv2 Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. + */ +static const char *__doc__ = + " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\""; + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <locale.h> +#include <sys/resource.h> +#include <getopt.h> +#include <net/if.h> +#include <time.h> + +#include <arpa/inet.h> +#include <linux/if_link.h> + +#define MAX_CPUS 12 /* WARNING - sync with _kern.c */ + +/* How many xdp_progs are defined in _kern.c */ +#define MAX_PROG 5 + +/* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead + * use bpf/libbpf.h), but cannot as (currently) needed for XDP + * attaching to a device via set_link_xdp_fd() + */ +#include "libbpf.h" +#include "bpf_load.h" + +#include "bpf_util.h" + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"debug", no_argument, NULL, 'D' }, + {"sec", required_argument, NULL, 's' }, + {"prognum", required_argument, NULL, 'p' }, + {"qsize", required_argument, NULL, 'q' }, + {"cpu", required_argument, NULL, 'c' }, + {"stress-mode", no_argument, NULL, 'x' }, + {"no-separators", no_argument, NULL, 'z' }, + {0, 0, NULL, 0 } +}; + +static void int_exit(int sig) +{ + fprintf(stderr, + "Interrupted: Removing XDP program on ifindex:%d device:%s\n", + ifindex, ifname); + if (ifindex > -1) + set_link_xdp_fd(ifindex, -1, xdp_flags); + exit(EXIT_OK); +} + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +/* gettime returns the current time of day 
in nanoseconds. + * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC) + * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE) + */ +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + exit(EXIT_FAIL); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 issue; +}; +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct stats_record { + struct record rx_cnt; + struct record redir_err; + struct record kthread; + struct record exception; + struct record enq[MAX_CPUS]; +}; + +static bool map_collect_percpu(int fd, __u32 key, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_issue = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + sum_dropped += values[i].dropped; + rec->cpu[i].issue = values[i].issue; + sum_issue += values[i].issue; + } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.issue = sum_issue; + return true; +} + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec *array; + size_t size; + + size = sizeof(struct datarec) * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int i; + + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + rec->rx_cnt.cpu = alloc_record_per_cpu(); + rec->redir_err.cpu = alloc_record_per_cpu(); + rec->kthread.cpu = alloc_record_per_cpu(); + rec->exception.cpu = alloc_record_per_cpu(); + for (i = 0; i < MAX_CPUS; i++) + rec->enq[i].cpu = alloc_record_per_cpu(); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < MAX_CPUS; i++) + free(r->enq[i].cpu); + free(r->exception.cpu); + free(r->kthread.cpu); + free(r->redir_err.cpu); + free(r->rx_cnt.cpu); + free(r); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if 
(period_ > 0) { + packets = r->dropped - p->dropped; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, + struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = packets / period_; + } + return pps; +} + +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + int prog_num) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + double pps = 0, drop = 0, err = 0; + struct record *rec, *prev; + int to_cpu; + double t; + int i; + + /* Header */ + printf("Running XDP/eBPF prog_num:%d\n", prog_num); + printf("%-15s %-7s %-14s %-11s %-9s\n", + "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info"); + + /* XDP rx_cnt */ + { + char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n"; + char *errstr = ""; + + rec = &stats_rec->rx_cnt; + prev = &stats_prev->rx_cnt; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + errstr = "cpu-dest/err"; + if (pps > 0) + printf(fmt_rx, "XDP-RX", + i, pps, drop, err, errstr); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + printf(fm2_rx, "XDP-RX", "total", pps, drop); + } + + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; + char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; + char *errstr = ""; + + rec = &stats_rec->enq[to_cpu]; + prev = &stats_prev->enq[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) { + errstr = "bulk-average"; + err = pps / err; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt, "cpumap-enqueue", + i, to_cpu, pps, drop, err, errstr); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (err > 0) { + errstr = "bulk-average"; + err = pps / err; /* calc average bulk size */ + } + printf(fm2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, err, errstr); + } + } + + /* cpumap kthread stats */ + { + char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *e_str = ""; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + e_str = "sched"; + if (pps > 0) + printf(fmt_k, "cpumap_kthread", + i, pps, drop, err, e_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (err > 0) + e_str = "sched-sum"; + printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str); + } + + /* XDP redirect err tracepoints (very unlikely) */ + { + char *fmt_err = "%-15s %-7d 
%'-14.0f %'-11.0f\n"; + char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; + + rec = &stats_rec->redir_err; + prev = &stats_prev->redir_err; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + if (pps > 0) + printf(fmt_err, "redirect_err", i, pps, drop); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + printf(fm2_err, "redirect_err", "total", pps, drop); + } + + /* XDP general exception tracepoints */ + { + char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; + char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; + + rec = &stats_rec->exception; + prev = &stats_prev->exception; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + if (pps > 0) + printf(fmt_err, "xdp_exception", i, pps, drop); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + printf(fm2_err, "xdp_exception", "total", pps, drop); + } + + printf("\n"); + fflush(stdout); +} + +static void stats_collect(struct stats_record *rec) +{ + int fd, i; + + fd = map_fd[1]; /* map: rx_cnt */ + map_collect_percpu(fd, 0, &rec->rx_cnt); + + fd = map_fd[2]; /* map: redirect_err_cnt */ + map_collect_percpu(fd, 1, &rec->redir_err); + + fd = map_fd[3]; /* map: cpumap_enqueue_cnt */ + for (i = 0; i < MAX_CPUS; i++) + map_collect_percpu(fd, i, &rec->enq[i]); + + fd = map_fd[4]; /* map: cpumap_kthread_cnt */ + map_collect_percpu(fd, 0, &rec->kthread); + + fd = map_fd[8]; /* map: exception_cnt */ + map_collect_percpu(fd, 0, &rec->exception); +} + + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static int create_cpu_entry(__u32 cpu, __u32 queue_size, + __u32 avail_idx, bool new) +{ + __u32 curr_cpus_count = 0; + __u32 key = 0; + int ret; + + /* Add a CPU entry to cpumap, as this allocate a cpu entry in + * the kernel for the cpu. + */ + ret = bpf_map_update_elem(map_fd[0], &cpu, &queue_size, 0); + if (ret) { + fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); + exit(EXIT_FAIL_BPF); + } + + /* Inform bpf_prog's that a new CPU is available to select + * from via some control maps. + */ + /* map_fd[5] = cpus_available */ + ret = bpf_map_update_elem(map_fd[5], &avail_idx, &cpu, 0); + if (ret) { + fprintf(stderr, "Add to avail CPUs failed\n"); + exit(EXIT_FAIL_BPF); + } + + /* When not replacing/updating existing entry, bump the count */ + /* map_fd[6] = cpus_count */ + ret = bpf_map_lookup_elem(map_fd[6], &key, &curr_cpus_count); + if (ret) { + fprintf(stderr, "Failed reading curr cpus_count\n"); + exit(EXIT_FAIL_BPF); + } + if (new) { + curr_cpus_count++; + ret = bpf_map_update_elem(map_fd[6], &key, &curr_cpus_count, 0); + if (ret) { + fprintf(stderr, "Failed write curr cpus_count\n"); + exit(EXIT_FAIL_BPF); + } + } + /* map_fd[7] = cpus_iterator */ + printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n", + new ? "Add-new":"Replace", cpu, avail_idx, + queue_size, curr_cpus_count); + + return 0; +} + +/* CPUs are zero-indexed. 
Thus, add a special sentinel default value
+ * in map cpus_available to mark CPU indexes not configured
+ */
+static void mark_cpus_unavailable(void)
+{
+	__u32 invalid_cpu = MAX_CPUS;
+	int ret, i;
+
+	for (i = 0; i < MAX_CPUS; i++) {
+		/* map_fd[5] = cpus_available */
+		ret = bpf_map_update_elem(map_fd[5], &i, &invalid_cpu, 0);
+		if (ret) {
+			fprintf(stderr, "Failed marking CPU unavailable\n");
+			exit(EXIT_FAIL_BPF);
+		}
+	}
+}
+
+/* Stress cpumap management code by concurrently changing underlying cpumap */
+static void stress_cpumap(void)
+{
+	/* Changing qsize will cause kernel to free and alloc a new
+	 * bpf_cpu_map_entry, with an associated/complicated tear-down
+	 * procedure.
+	 */
+	create_cpu_entry(1, 1024, 0, false);
+	create_cpu_entry(1, 128, 0, false);
+	create_cpu_entry(1, 16000, 0, false);
+}
+
+static void stats_poll(int interval, bool use_separators, int prog_num,
+			bool stress_mode)
+{
+	struct stats_record *record, *prev;
+
+	record = alloc_stats_record();
+	prev = alloc_stats_record();
+	stats_collect(record);
+
+	/* Trick to pretty-print with thousands separators: use %' */
+	if (use_separators)
+		setlocale(LC_NUMERIC, "en_US");
+
+	while (1) {
+		swap(&prev, &record);
+		stats_collect(record);
+		stats_print(record, prev, prog_num);
+		sleep(interval);
+		if (stress_mode)
+			stress_cpumap();
+	}
+
+	free_stats_record(record);
+	free_stats_record(prev);
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	bool use_separators = true;
+	bool stress_mode = false;
+	char filename[256];
+	bool debug = false;
+	int added_cpus = 0;
+	int longindex = 0;
+	int interval = 2;
+	int prog_num = 0;
+	int add_cpu = -1;
+	__u32 qsize;
+	int opt;
+
+	/* Notice: choosing the queue size is very important with the
+	 * ixgbe driver, because its driver page recycling trick is
+	 * dependent on pages being returned quickly.  The number of
+	 * outstanding packets in the system must be less than 2x
+	 * RX-ring size.
+ */ + qsize = 128+64; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_bpf_file(filename)) { + fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf); + return EXIT_FAIL; + } + + if (!prog_fd[0]) { + fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno)); + return EXIT_FAIL; + } + + mark_cpus_unavailable(); + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hSd:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + if (strlen(optarg) >= IF_NAMESIZE) { + fprintf(stderr, "ERR: --dev name too long\n"); + goto error; + } + ifname = (char *)&ifname_buf; + strncpy(ifname, optarg, IF_NAMESIZE); + ifindex = if_nametoindex(ifname); + if (ifindex == 0) { + fprintf(stderr, + "ERR: --dev name unknown err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 's': + interval = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'D': + debug = true; + break; + case 'x': + stress_mode = true; + break; + case 'z': + use_separators = false; + break; + case 'p': + /* Selecting eBPF prog to load */ + prog_num = atoi(optarg); + if (prog_num < 0 || prog_num >= MAX_PROG) { + fprintf(stderr, + "--prognum too large err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 'c': + /* Add multiple CPUs */ + add_cpu = strtoul(optarg, NULL, 0); + if (add_cpu >= MAX_CPUS) { + fprintf(stderr, + "--cpu nr too large for cpumap err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + create_cpu_entry(add_cpu, qsize, added_cpus, true); + added_cpus++; + break; + case 'q': + qsize = atoi(optarg); + break; + case 'h': + error: + default: + usage(argv); + return EXIT_FAIL_OPTION; + } + } + /* Required option */ + if (ifindex == -1) { + fprintf(stderr, "ERR: required option --dev missing\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + /* Required option */ + if (add_cpu == -1) { + fprintf(stderr, "ERR: required option --cpu missing\n"); + fprintf(stderr, " Specify multiple --cpu option to add more\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + + /* Remove XDP program when program is interrupted */ + signal(SIGINT, int_exit); + + if (set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) { + fprintf(stderr, "link set xdp fd failed\n"); + return EXIT_FAIL_XDP; + } + + if (debug) { + printf("Debug-mode reading trace pipe (fix #define DEBUG)\n"); + read_trace_pipe(); + } + + stats_poll(interval, use_separators, prog_num, stress_mode); + return EXIT_OK; +} diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index d4d86a273fba..978a532f0748 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -20,6 +20,7 @@ #include <string.h> #include <unistd.h> #include <libgen.h> +#include <sys/resource.h> #include "bpf_load.h" #include "bpf_util.h" @@ -74,6 +75,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int ret, opt, key = 0; @@ -97,6 +99,11 @@ int main(int argc, char **argv) return 1; } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex_in = strtoul(argv[optind], NULL, 0); ifindex_out = strtoul(argv[optind + 1], NULL, 0); printf("input: %d output: %d\n", ifindex_in, ifindex_out); diff --git a/samples/pktgen/functions.sh 
b/samples/pktgen/functions.sh index 205e4cde4601..f8bb3cd0f4ce 100644 --- a/samples/pktgen/functions.sh +++ b/samples/pktgen/functions.sh @@ -119,3 +119,46 @@ function root_check_run_with_sudo() { err 4 "cannot perform sudo run of $0" fi } + +# Exact input device's NUMA node info +function get_iface_node() +{ + local node=$(</sys/class/net/$1/device/numa_node) + if [[ $node == -1 ]]; then + echo 0 + else + echo $node + fi +} + +# Given an Dev/iface, get its queues' irq numbers +function get_iface_irqs() +{ + local IFACE=$1 + local queues="${IFACE}-.*TxRx" + + irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:) + [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:) + [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\ + do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\ + done) + [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE" + + echo $irqs +} + +# Given a NUMA node, return cpu ids belonging to it. +function get_node_cpus() +{ + local node=$1 + local node_cpu_list + local node_cpu_range_list=`cut -f1- -d, --output-delimiter=" " \ + /sys/devices/system/node/node$node/cpulist` + + for cpu_range in $node_cpu_range_list + do + node_cpu_list="$node_cpu_list "`seq -s " " ${cpu_range//-/ }` + done + + echo $node_cpu_list +} diff --git a/samples/pktgen/pktgen.conf-1-1-ip6 b/samples/pktgen/pktgen.conf-1-1-ip6 deleted file mode 100755 index 62426afeef84..000000000000 --- a/samples/pktgen/pktgen.conf-1-1-ip6 +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. -# IPv6. Note increase in minimal packet length - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 - -CLONE_SKB="clone_skb 1000000" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 66" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst6 fec0::1" - pgset "src6 fec0::2" - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen.conf-1-1-ip6-rdos b/samples/pktgen/pktgen.conf-1-1-ip6-rdos deleted file mode 100755 index 3ac3eb1f3504..000000000000 --- a/samples/pktgen/pktgen.conf-1-1-ip6-rdos +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. -# IPv6. 
Note increase in minimal packet length - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 means maximum speed. - -# We need to do alloc for every skb since we cannot clone here. -CLONE_SKB="clone_skb 0" - -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 66" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst6_min fec0::1" - pgset "dst6_max fec0::FFFF:FFFF" - - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen.conf-1-2 b/samples/pktgen/pktgen.conf-1-2 deleted file mode 100755 index a85552760762..000000000000 --- a/samples/pktgen/pktgen.conf-1-2 +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# One CPU means one thread. One CPU example. We add eth1, eth2 respectivly. - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - echo "Adding eth2" - pgset "add_device eth2" - - -# device config -# delay 0 means maximum speed. - -CLONE_SKB="clone_skb 1000000" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 60" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst 10.10.11.2" - pgset "dst_mac 00:04:23:08:91:dc" - -PGDEV=/proc/net/pktgen/eth2 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst 192.168.2.2" - pgset "dst_mac 00:04:23:08:91:de" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... 
ctrl^C to stop"
- trap true INT
- pgset "start"
- echo "Done"
- cat /proc/net/pktgen/eth1 /proc/net/pktgen/eth2
diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
index 4c2e4217638a..8fdd36722d9e 100755
--- a/samples/pktgen/pktgen_sample03_burst_single_flow.sh
+++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
@@ -31,7 +31,7 @@ if [ -z "$DEST_IP" ]; then
 fi
 [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
 [ -z "$BURST" ] && BURST=32
-[ -z "$CLONE_SKB" ] && CLONE_SKB="100000"
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting
 [ -z "$COUNT" ] && COUNT="0"   # Zero means indefinitely
 
 # Base Config
diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
new file mode 100755
index 000000000000..353adc17205e
--- /dev/null
+++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# Multiqueue: Using pktgen threads for sending on multiple CPUs
+# * add devices to kernel threads which are in the same NUMA node
+# * bind each device queue's irq affinity to its thread, 1:1 mapping
+# * notice the naming scheme for keeping device names unique
+# * naming scheme: dev@thread_number
+# * flow variation via random UDP source port
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+#
+# Required param: -i dev in $DEV
+source ${basedir}/parameters.sh
+
+# Base Config
+DELAY="0"        # Zero means max speed
+[ -z "$COUNT" ] && COUNT="20000000"   # Zero means indefinitely
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+
+# Flow variation: random source port between min and max
+UDP_MIN=9
+UDP_MAX=109
+
+node=`get_iface_node $DEV`
+irq_array=(`get_iface_irqs $DEV`)
+cpu_array=(`get_node_cpus $node`)
+
+[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \
+    err 1 "Thread number $THREADS exceeds: min (${#irq_array[*]},${#cpu_array[*]})"
+
+# (example of setting default params in your script)
+if [ -z "$DEST_IP" ]; then
+    [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+
+# General cleanup of everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((i = 0; i < $THREADS; i++)); do
+    # The device name is extended with @name, using thread number to
+    # make them unique, but any name will do.
+    # Set the queue's irq affinity to this $thread (processor);
+    # if '-f' is given, offset the cpu id accordingly
+    thread=${cpu_array[$((i+F_THREAD))]}
+    dev=${DEV}@${thread}
+    echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list
+    info "irq ${irq_array[$i]} affinity is set to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`"
+
+    # Remove all other devices and add $dev to this thread
+    pg_thread $thread "rem_device_all"
+    pg_thread $thread "add_device" $dev
+
+    # Select the queue, binding the queue and $dev in a 1:1 relationship
+    queue_num=$i
+    info "queue number is $queue_num"
+    pg_set $dev "queue_map_min $queue_num"
+    pg_set $dev "queue_map_max $queue_num"
+
+    # Notice: config queue to map to cpu (mirrors smp_processor_id())
+    # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number
+    pg_set $dev "flag QUEUE_MAP_CPU"
+
+    # Base config of dev
+    pg_set $dev "count $COUNT"
+    pg_set $dev "clone_skb $CLONE_SKB"
+    pg_set $dev "pkt_size $PKT_SIZE"
+    pg_set $dev "delay $DELAY"
+
+    # Flag example disabling timestamping
+    pg_set $dev "flag NO_TIMESTAMP"
+
+    # Destination
+    pg_set $dev "dst_mac $DST_MAC"
+    pg_set $dev "dst$IP6 $DEST_IP"
+
+    # Setup random UDP port src range
+    pg_set $dev "flag UDPSRC_RND"
+    pg_set $dev "udp_src_min $UDP_MIN"
+    pg_set $dev "udp_src_max $UDP_MAX"
+done
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+# Print results
+for ((i = 0; i < $THREADS; i++)); do
+    thread=${cpu_array[$((i+F_THREAD))]}
+    dev=${DEV}@${thread}
+    echo "Device: $dev"
+    cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+done
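
The stats path in xdp_redirect_cpu_user.c hinges on how per-CPU maps are read from user space: a single bpf_map_lookup_elem() on a per-CPU map fills one slot per possible CPU, and map_collect_percpu() sums those slots into a total. A minimal sketch of that pattern, assuming a per-CPU map holding plain __u64 counters and the helpers the samples already use (bpf_map_lookup_elem() from tools/lib/bpf, bpf_num_possible_cpus() from the selftests' bpf_util.h); the function name is hypothetical:

	/* Hypothetical helper showing the per-CPU read pattern used by
	 * map_collect_percpu(): one lookup returns a value per possible CPU,
	 * which user space then sums.
	 */
	static __u64 sum_percpu_entry(int fd, __u32 key)
	{
		unsigned int nr_cpus = bpf_num_possible_cpus();
		__u64 values[nr_cpus];
		__u64 sum = 0;
		unsigned int i;

		if (bpf_map_lookup_elem(fd, &key, values) != 0)
			return 0;	/* lookup failed; treat as empty */

		for (i = 0; i < nr_cpus; i++)
			sum += values[i];
		return sum;
	}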
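create_cpu_entry() drives the cpumap plus a small control array: writing a queue size keyed by CPU id into the cpumap allocates that CPU's kthread and queue in the kernel, while a second array publishes which CPUs the XDP program may redirect to. A hedged sketch of just that core, with hypothetical fd parameters standing in for the sample's map_fd[0] and map_fd[5], and error handling trimmed:

	/* Hypothetical sketch of the cpumap setup done by create_cpu_entry():
	 * cpumap key = CPU id, value = kthread queue size; a second array
	 * tells the XDP program which CPUs it may pick.
	 */
	static int add_cpu_to_cpumap(int cpumap_fd, int cpus_available_fd,
				     __u32 cpu, __u32 queue_size, __u32 avail_idx)
	{
		/* Writing the value allocates the per-CPU kthread and its queue */
		if (bpf_map_update_elem(cpumap_fd, &cpu, &queue_size, 0))
			return -1;

		/* Publish the CPU so the XDP side can select it */
		return bpf_map_update_elem(cpus_available_fd, &avail_idx, &cpu, 0);
	}

In the sample as patched, the same effect is reached by repeating --cpu on the command line; each occurrence adds one entry at the next available index.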
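The pktgen helper get_node_cpus() expands a sysfs cpulist such as "0-3,8-11" into individual CPU ids before the NUMA-aware sample maps threads onto them. For reference, a hypothetical C equivalent of that expansion (not part of the patch) could look like:

	/* Hypothetical C counterpart of the get_node_cpus() shell helper:
	 * expand /sys/devices/system/node/node<N>/cpulist into CPU ids.
	 */
	#include <stdio.h>

	static int print_node_cpus(int node)
	{
		char path[64];
		int lo, hi, cpu, sep;
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/node/node%d/cpulist", node);
		f = fopen(path, "r");
		if (!f)
			return -1;

		while (fscanf(f, "%d", &lo) == 1) {
			hi = lo;
			sep = fgetc(f);		/* '-', ',', '\n' or EOF */
			if (sep == '-') {
				if (fscanf(f, "%d", &hi) != 1)
					break;
				fgetc(f);	/* consume ',' or '\n' after a range */
			}
			for (cpu = lo; cpu <= hi; cpu++)
				printf("%d ", cpu);
		}
		printf("\n");
		fclose(f);
		return 0;
	}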