diff options
author | David S. Miller <davem@davemloft.net> | 2017-08-07 23:09:48 +0200 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-08-07 23:09:48 +0200 |
commit | 1c14dc48503b4c385a4f039341300cefa00cbdd4 (patch) | |
tree | 172104216444020892b0fbbd0f4de8a72e8686a9 | |
parent | of_mdio: use of_property_read_u32_array() (diff) | |
parent | bpf: add a test case for syscalls/sys_{enter|exit}_* tracepoints (diff) | |
download | linux-1c14dc48503b4c385a4f039341300cefa00cbdd4.tar.xz linux-1c14dc48503b4c385a4f039341300cefa00cbdd4.zip |
Merge branch 'bpf-add-support-for-sys-enter-exit-tracepoints'
Yonghong Song says:
====================
bpf: add support for sys_{enter|exit}_* tracepoints
Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_*
style tracepoints. The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_*
tracepoints are treated differently from other tracepoints and there
is no bpf hook to it.
This patch set adds bpf support for these syscalls tracepoints and also
adds a test case for it.
Changelogs:
v3 -> v4:
- Check the legality of ctx offset access for syscall tracepoint as well.
trace_event_get_offsets will return correct max offset for each
specific syscall tracepoint.
- Use variable length array to avoid hardcode 6 as the maximum
arguments beyond syscall_nr.
v2 -> v3:
- Fix a build issue
v1 -> v2:
- Do not use TRACE_EVENT_FL_CAP_ANY to identify syscall tracepoint.
Instead use trace_event_call->class.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/syscalls.h | 12 | ||||
-rw-r--r-- | kernel/events/core.c | 10 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 53 | ||||
-rw-r--r-- | samples/bpf/Makefile | 4 | ||||
-rw-r--r-- | samples/bpf/syscall_tp_kern.c | 62 | ||||
-rw-r--r-- | samples/bpf/syscall_tp_user.c | 71 |
6 files changed, 206 insertions, 6 deletions
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3cb15ea48aee..c9170218e9e6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -172,8 +172,20 @@ extern struct trace_event_functions exit_syscall_print_funcs; static struct syscall_metadata __used \ __attribute__((section("__syscalls_metadata"))) \ *__p_syscall_meta_##sname = &__syscall_meta_##sname; + +static inline int is_syscall_trace_event(struct trace_event_call *tp_event) +{ + return tp_event->class == &event_class_syscall_enter || + tp_event->class == &event_class_syscall_exit; +} + #else #define SYSCALL_METADATA(sname, nb, ...) + +static inline int is_syscall_trace_event(struct trace_event_call *tp_event) +{ + return 0; +} #endif #define SYSCALL_DEFINE0(sname) \ diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..a7a6c1d19a49 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8050,7 +8050,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { - bool is_kprobe, is_tracepoint; + bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; if (event->attr.type != PERF_TYPE_TRACEPOINT) @@ -8061,7 +8061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; - if (!is_kprobe && !is_tracepoint) + is_syscall_tp = is_syscall_trace_event(event->tp_event); + if (!is_kprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; @@ -8070,13 +8071,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return PTR_ERR(prog); if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || - (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || + (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; } - if (is_tracepoint) { + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) { diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..7a1a92036563 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; +static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) { + struct syscall_tp_t { + unsigned long long regs; + unsigned long syscall_nr; + unsigned long args[sys_data->nb_args]; + } param; + int i; + + *(struct pt_regs **)¶m = regs; + param.syscall_nr = rec->nr; + for (i = 0; i < sys_data->nb_args; i++) + param.args[i] = rec->args[i]; + return trace_call_bpf(prog, ¶m); +} + static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; + struct bpf_prog *prog; int syscall_nr; int rctx; int size; @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (hlist_empty(head)) + if (!prog && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); + + if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } +static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, + struct syscall_trace_exit *rec) { + struct syscall_tp_t { + unsigned long long regs; + unsigned long syscall_nr; + unsigned long ret; + } param; + + *(struct pt_regs **)¶m = regs; + param.syscall_nr = rec->nr; + param.ret = rec->ret; + return trace_call_bpf(prog, ¶m); +} + static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; + struct bpf_prog *prog; int syscall_nr; int rctx; int size; @@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; + prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (hlist_empty(head)) + if (!prog && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); + + if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 770d46cdf9f4..f1010fe759fe 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -39,6 +39,7 @@ hostprogs-y += per_socket_stats_example hostprogs-y += load_sock_ops hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map +hostprogs-y += syscall_tp # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o @@ -82,6 +83,7 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o +syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -125,6 +127,7 @@ always += tcp_iw_kern.o always += tcp_clamp_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o +always += syscall_tp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -163,6 +166,7 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf HOSTLOADLIBES_test_map_in_map += -lelf HOSTLOADLIBES_xdp_redirect += -lelf HOSTLOADLIBES_xdp_redirect_map += -lelf +HOSTLOADLIBES_syscall_tp += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c new file mode 100644 index 000000000000..9149c524d279 --- /dev/null +++ b/samples/bpf/syscall_tp_kern.c @@ -0,0 +1,62 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +struct syscalls_enter_open_args { + unsigned long long unused; + long syscall_nr; + long filename_ptr; + long flags; + long mode; +}; + +struct syscalls_exit_open_args { + unsigned long long unused; + long syscall_nr; + long ret; +}; + +struct bpf_map_def SEC("maps") enter_open_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") exit_open_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; + +static __always_inline void count(void *map) +{ + u32 key = 0; + u32 *value, init_val = 1; + + value = bpf_map_lookup_elem(map, &key); + if (value) + *value += 1; + else + bpf_map_update_elem(map, &key, &init_val, BPF_NOEXIST); +} + +SEC("tracepoint/syscalls/sys_enter_open") +int trace_enter_open(struct syscalls_enter_open_args *ctx) +{ + count((void *)&enter_open_map); + return 0; +} + +SEC("tracepoint/syscalls/sys_exit_open") +int trace_enter_exit(struct syscalls_exit_open_args *ctx) +{ + count((void *)&exit_open_map); + return 0; +} diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c new file mode 100644 index 000000000000..a3cb91ebf4e7 --- /dev/null +++ b/samples/bpf/syscall_tp_user.c @@ -0,0 +1,71 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdlib.h> +#include <signal.h> +#include <linux/bpf.h> +#include <string.h> +#include <linux/perf_event.h> +#include <errno.h> +#include <assert.h> +#include <stdbool.h> +#include <sys/resource.h> +#include "libbpf.h" +#include "bpf_load.h" + +/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*. + * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. + */ + +static void verify_map(int map_id) +{ + __u32 key = 0; + __u32 val; + + if (bpf_map_lookup_elem(map_id, &key, &val) != 0) { + fprintf(stderr, "map_lookup failed: %s\n", strerror(errno)); + return; + } + if (val == 0) + fprintf(stderr, "failed: map #%d returns value 0\n", map_id); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + int fd; + + setrlimit(RLIMIT_MEMLOCK, &r); + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + fprintf(stderr, "%s", bpf_log_buf); + return 1; + } + + /* current load_bpf_file has perf_event_open default pid = -1 + * and cpu = 0, which permits attached bpf execution on + * all cpus for all pid's. bpf program execution ignores + * cpu affinity. + */ + /* trigger some "open" operations */ + fd = open(filename, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "open failed: %s\n", strerror(errno)); + return 1; + } + close(fd); + + /* verify the map */ + verify_map(map_fd[0]); + verify_map(map_fd[1]); + + return 0; +} |