diff options
author | David S. Miller <davem@davemloft.net> | 2021-07-16 07:40:10 +0200 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2021-07-16 07:40:10 +0200 |
commit | 82a1ffe57ec6455658584459de56ee4b401c1dfb (patch) | |
tree | 66fd801e94825018e15701e508848b6a188bf2c6 /include | |
parent | ipv6: remove unnecessary local variable (diff) | |
parent | Merge branch 'sockmap: add sockmap support for unix datagram socket' (diff) | |
download | linux-82a1ffe57ec6455658584459de56ee4b401c1dfb.tar.xz linux-82a1ffe57ec6455658584459de56ee4b401c1dfb.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:
====================
pull-request: bpf-next 2021-07-15
The following pull-request contains BPF updates for your *net-next* tree.
We've added 45 non-merge commits during the last 15 day(s) which contain
a total of 52 files changed, 3122 insertions(+), 384 deletions(-).
The main changes are:
1) Introduce bpf timers, from Alexei.
2) Add sockmap support for unix datagram socket, from Cong.
3) Fix potential memleak and UAF in the verifier, from He.
4) Add bpf_get_func_ip helper, from Jiri.
5) Improvements to generic XDP mode, from Kumar.
6) Support for passing xdp_md to XDP programs in bpf_prog_run, from Zvi.
===================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/bitops.h | 50 | ||||
-rw-r--r-- | include/linux/bpf.h | 100 | ||||
-rw-r--r-- | include/linux/bpf_verifier.h | 19 | ||||
-rw-r--r-- | include/linux/btf.h | 1 | ||||
-rw-r--r-- | include/linux/filter.h | 3 | ||||
-rw-r--r-- | include/linux/netdevice.h | 2 | ||||
-rw-r--r-- | include/linux/skbuff.h | 10 | ||||
-rw-r--r-- | include/linux/typecheck.h | 9 | ||||
-rw-r--r-- | include/net/af_unix.h | 12 | ||||
-rw-r--r-- | include/net/xdp.h | 5 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 85 |
11 files changed, 250 insertions, 46 deletions
diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 26bf15e6cd35..5e62e2383b7f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,6 +4,7 @@ #include <asm/types.h> #include <linux/bits.h> +#include <linux/typecheck.h> #include <uapi/linux/kernel.h> @@ -253,6 +254,55 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, __clear_bit(nr, addr); } +/** + * __ptr_set_bit - Set bit in a pointer's value + * @nr: the bit to set + * @addr: the address of the pointer variable + * + * Example: + * void *p = foo(); + * __ptr_set_bit(bit, &p); + */ +#define __ptr_set_bit(nr, addr) \ + ({ \ + typecheck_pointer(*(addr)); \ + __set_bit(nr, (unsigned long *)(addr)); \ + }) + +/** + * __ptr_clear_bit - Clear bit in a pointer's value + * @nr: the bit to clear + * @addr: the address of the pointer variable + * + * Example: + * void *p = foo(); + * __ptr_clear_bit(bit, &p); + */ +#define __ptr_clear_bit(nr, addr) \ + ({ \ + typecheck_pointer(*(addr)); \ + __clear_bit(nr, (unsigned long *)(addr)); \ + }) + +/** + * __ptr_test_bit - Test bit in a pointer's value + * @nr: the bit to test + * @addr: the address of the pointer variable + * + * Example: + * void *p = foo(); + * if (__ptr_test_bit(bit, &p)) { + * ... + * } else { + * ... + * } + */ +#define __ptr_test_bit(nr, addr) \ + ({ \ + typecheck_pointer(*(addr)); \ + test_bit(nr, (unsigned long *)(addr)); \ + }) + #ifdef __KERNEL__ #ifndef set_mask_bits diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e8e2b0393ca9..0edff8f5177e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -168,6 +168,7 @@ struct bpf_map { u32 max_entries; u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ + int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; @@ -197,30 +198,53 @@ static inline bool map_value_has_spin_lock(const struct bpf_map *map) return map->spin_lock_off >= 0; } -static inline void check_and_init_map_lock(struct bpf_map *map, void *dst) +static inline bool map_value_has_timer(const struct bpf_map *map) { - if (likely(!map_value_has_spin_lock(map))) - return; - *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = - (struct bpf_spin_lock){}; + return map->timer_off >= 0; } -/* copy everything but bpf_spin_lock */ +static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +{ + if (unlikely(map_value_has_spin_lock(map))) + *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = + (struct bpf_spin_lock){}; + if (unlikely(map_value_has_timer(map))) + *(struct bpf_timer *)(dst + map->timer_off) = + (struct bpf_timer){}; +} + +/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { + u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0; + if (unlikely(map_value_has_spin_lock(map))) { - u32 off = map->spin_lock_off; + s_off = map->spin_lock_off; + s_sz = sizeof(struct bpf_spin_lock); + } else if (unlikely(map_value_has_timer(map))) { + t_off = map->timer_off; + t_sz = sizeof(struct bpf_timer); + } - memcpy(dst, src, off); - memcpy(dst + off + sizeof(struct bpf_spin_lock), - src + off + sizeof(struct bpf_spin_lock), - map->value_size - off - sizeof(struct bpf_spin_lock)); + if (unlikely(s_sz || t_sz)) { + if (s_off < t_off || !s_sz) { + swap(s_off, t_off); + swap(s_sz, t_sz); + } + memcpy(dst, src, t_off); + memcpy(dst + t_off + t_sz, + src + t_off + t_sz, + s_off - t_off - t_sz); + memcpy(dst + s_off + s_sz, + src + s_off + s_sz, + map->value_size - s_off - s_sz); } else { memcpy(dst, src, map->value_size); } } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); +void bpf_timer_cancel_and_free(void *timer); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -314,6 +338,7 @@ enum bpf_arg_type { ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ + ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, }; @@ -554,6 +579,11 @@ struct btf_func_model { */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) +/* Store IP address of the caller on the trampoline stack, + * so it's available for trampoline's programs. + */ +#define BPF_TRAMP_F_IP_ARG BIT(3) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ @@ -1509,12 +1539,12 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); -bool dev_map_can_have_prog(struct bpf_map *map); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); -bool cpu_map_prog_allowed(struct bpf_map *map); +int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, + struct sk_buff *skb); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1711,6 +1741,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, + struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + static inline bool cpu_map_prog_allowed(struct bpf_map *map) { return false; @@ -1852,6 +1888,12 @@ void bpf_map_offload_map_free(struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); +void sock_map_unhash(struct sock *sk); +void sock_map_close(struct sock *sk, long timeout); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1884,24 +1926,6 @@ static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, { return -ENOTSUPP; } -#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ - -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); -int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); -void sock_map_unhash(struct sock *sk); -void sock_map_close(struct sock *sk, long timeout); - -void bpf_sk_reuseport_detach(struct sock *sk); -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value); -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags); -#else -static inline void bpf_sk_reuseport_detach(struct sock *sk) -{ -} #ifdef CONFIG_BPF_SYSCALL static inline int sock_map_get_from_fd(const union bpf_attr *attr, @@ -1921,7 +1945,21 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ + +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} +#ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e774ecc1cd1f..b847e1ccd10f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -53,7 +53,14 @@ struct bpf_reg_state { /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ - struct bpf_map *map_ptr; + struct { + struct bpf_map *map_ptr; + /* To distinguish map lookups from outer map + * the map_uid is non-zero for registers + * pointing to inner maps. + */ + u32 map_uid; + }; /* for PTR_TO_BTF_ID */ struct { @@ -201,12 +208,19 @@ struct bpf_func_state { * zero == main subprog */ u32 subprogno; + /* Every bpf_timer_start will increment async_entry_cnt. + * It's used to distinguish: + * void foo(void) { for(;;); } + * void foo(void) { bpf_timer_set_callback(,foo); } + */ + u32 async_entry_cnt; + bool in_callback_fn; + bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; - bool in_callback_fn; struct bpf_stack_state *stack; }; @@ -392,6 +406,7 @@ struct bpf_subprog_info { bool has_tail_call; bool tail_call_reachable; bool has_ld_abs; + bool is_async_cb; }; /* single container for all structs diff --git a/include/linux/btf.h b/include/linux/btf.h index 94a0c976c90f..214fde93214b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -99,6 +99,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); +int btf_find_timer(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/include/linux/filter.h b/include/linux/filter.h index 472f97074da0..ba36989f711a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -559,7 +559,8 @@ struct bpf_prog { kprobe_override:1, /* Do we override a kprobe? */ has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ - call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_func_ip:1; /* Do we call get_func_ip() */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf5bb008aa9..42f6f866d5f3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3984,6 +3984,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); } +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog); void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b2db9cd9a73f..f19190820e63 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -863,8 +863,8 @@ struct sk_buff { __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; #endif -#ifdef CONFIG_NET_REDIRECT __u8 redirected:1; +#ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_TLS_DEVICE @@ -4664,17 +4664,13 @@ static inline __wsum lco_csum(struct sk_buff *skb) static inline bool skb_is_redirected(const struct sk_buff *skb) { -#ifdef CONFIG_NET_REDIRECT return skb->redirected; -#else - return false; -#endif } static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) { -#ifdef CONFIG_NET_REDIRECT skb->redirected = 1; +#ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) skb->tstamp = 0; @@ -4683,9 +4679,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) static inline void skb_reset_redirect(struct sk_buff *skb) { -#ifdef CONFIG_NET_REDIRECT skb->redirected = 0; -#endif } static inline bool skb_csum_is_sctp(struct sk_buff *skb) diff --git a/include/linux/typecheck.h b/include/linux/typecheck.h index 20d310331eb5..46b15e2aaefb 100644 --- a/include/linux/typecheck.h +++ b/include/linux/typecheck.h @@ -22,4 +22,13 @@ (void)__tmp; \ }) +/* + * Check at compile time that something is a pointer type. + */ +#define typecheck_pointer(x) \ +({ typeof(x) __dummy; \ + (void)sizeof(*__dummy); \ + 1; \ +}) + #endif /* TYPECHECK_H_INCLUDED */ diff --git a/include/net/af_unix.h b/include/net/af_unix.h index f42fdddecd41..435a2c3d5a6f 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -82,6 +82,8 @@ static inline struct unix_sock *unix_sk(const struct sock *sk) long unix_inq_len(struct sock *sk); long unix_outq_len(struct sock *sk); +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, + int flags); #ifdef CONFIG_SYSCTL int unix_sysctl_register(struct net *net); void unix_sysctl_unregister(struct net *net); @@ -89,4 +91,14 @@ void unix_sysctl_unregister(struct net *net); static inline int unix_sysctl_register(struct net *net) { return 0; } static inline void unix_sysctl_unregister(struct net *net) {} #endif + +#ifdef CONFIG_BPF_SYSCALL +extern struct proto unix_proto; + +int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +void __init unix_bpf_build_proto(void); +#else +static inline void __init unix_bpf_build_proto(void) +{} +#endif #endif diff --git a/include/net/xdp.h b/include/net/xdp.h index 5533f0ab2afc..ad5b02dcb6f4 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -276,6 +276,11 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) return unlikely(xdp->data_meta > xdp->data); } +static inline bool xdp_metalen_invalid(unsigned long metalen) +{ + return (metalen & (sizeof(__u32) - 1)) || (metalen > 32); +} + struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf9252c7381e..2db6925e04f4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -324,9 +324,6 @@ union bpf_iter_link_info { * **BPF_PROG_TYPE_SK_LOOKUP** * *data_in* and *data_out* must be NULL. * - * **BPF_PROG_TYPE_XDP** - * *ctx_in* and *ctx_out* must be NULL. - * * **BPF_PROG_TYPE_RAW_TRACEPOINT**, * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** * @@ -3249,7 +3246,7 @@ union bpf_attr { * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return @@ -4780,6 +4777,76 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing and kprobe programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4951,6 +5018,11 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6077,6 +6149,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. |