73 files changed, 4339 insertions, 772 deletions
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 75a0dca5f295..5b613d2a5f1a 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -60,13 +60,13 @@ Q: Where can I find patches currently under discussion for BPF subsystem? A: All patches that are Cc'ed to netdev are queued for review under netdev patchwork project: - http://patchwork.ozlabs.org/project/netdev/list/ + https://patchwork.kernel.org/project/netdevbpf/list/ Those patches which target BPF, are assigned to a 'bpf' delegate for further processing from BPF maintainers. The current queue with patches under review can be found at: - https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 + https://patchwork.kernel.org/project/netdevbpf/list/?delegate=121173 Once the patches have been reviewed by the BPF community as a whole and approved by the BPF maintainers, their status in patchwork will be diff --git a/MAINTAINERS b/MAINTAINERS index c80f87d7258c..78a465560513 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3263,7 +3263,7 @@ M: Daniel Borkmann <daniel@iogearbox.net> R: Martin KaFai Lau <kafai@fb.com> R: Song Liu <songliubraving@fb.com> R: Yonghong Song <yhs@fb.com> -R: Andrii Nakryiko <andriin@fb.com> +R: Andrii Nakryiko <andrii@kernel.org> R: John Fastabend <john.fastabend@gmail.com> R: KP Singh <kpsingh@chromium.org> L: netdev@vger.kernel.org diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 091e5b4ba042..8c737668008a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev) return smp_processor_id() % dev->real_num_rx_queues; } +static struct net_device *veth_peer_dev(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + /* Callers must be under RCU read side. 
*/ + return rcu_dereference(priv->peer); +} + static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) @@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, + .ndo_get_peer_dev = veth_peer_dev, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50e5c4b52bd1..2b16bf48aab6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -82,7 +82,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); - u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); @@ -293,6 +293,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, }; @@ -307,6 +308,8 @@ enum bpf_return_type { RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -405,6 +408,7 @@ enum bpf_reg_type { PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ }; /* The information passed from prog-specific *_is_valid_access @@ -1828,6 +1832,8 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 363b4f1c562a..e83ef6f6bf43 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -308,6 +308,13 @@ struct bpf_insn_aux_data { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; + struct { + enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ + union { + u32 btf_id; /* btf_id for struct typed var */ + u32 mem_size; /* mem_size for non-struct typed var */ + }; + } btf_var; }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 
024e16ff7dcc..2bf641829664 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,11 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, datasec_type, member) \ + for (i = 0, member = btf_type_var_secinfo(datasec_type); \ + i < btf_type_vlen(datasec_type); \ + i++, member++) + static inline bool btf_type_is_ptr(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; @@ -145,6 +150,21 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } +static inline bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static inline bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -179,6 +199,12 @@ static inline const struct btf_member *btf_type_member(const struct btf_type *t) return (const struct btf_member *)(t + 1); } +static inline const struct btf_var_secinfo *btf_type_var_secinfo( + const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a0df43b13839..948d7105e91a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1276,6 +1276,9 @@ struct netdev_net_notifier { * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); + * If a device is paired with a peer device, return the peer instance. + * The caller must be under RCU read context. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1483,6 +1486,7 @@ struct net_device_ops { struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); }; /** diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 3119928fc103..fec0c5ac1c4f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node); int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); diff --git a/include/net/tcp.h b/include/net/tcp.h index 3601dea931a6..d4ef5bf94168 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2228,34 +2228,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, #endif /* CONFIG_NET_SOCK_MSG */ #ifdef CONFIG_CGROUP_BPF -/* Copy the listen sk's HDR_OPT_CB flags to its child. 
- * - * During 3-Way-HandShake, the synack is usually sent from - * the listen sk with the HDR_OPT_CB flags set so that - * bpf-prog will be called to write the BPF hdr option. - * - * In fastopen, the child sk is used to send synack instead - * of the listen sk. Thus, inheriting the HDR_OPT_CB flags - * from the listen sk gives the bpf-prog a chance to write - * BPF hdr option in the synack pkt during fastopen. - * - * Both fastopen and non-fastopen child will inherit the - * HDR_OPT_CB flags to keep the bpf-prog having a consistent - * behavior when deciding to clear this cb flags (or not) - * during the PASSIVE_ESTABLISHED_CB. - * - * In the future, other cb flags could be inherited here also. - */ -static inline void bpf_skops_init_child(const struct sock *sk, - struct sock *child) -{ - tcp_sk(child)->bpf_sock_ops_cb_flags = - tcp_sk(sk)->bpf_sock_ops_cb_flags & - (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | - BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | - BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); -} - static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) @@ -2264,11 +2236,6 @@ static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, skops->skb_data_end = skb->data + end_offset; } #else -static inline void bpf_skops_init_child(const struct sock *sk, - struct sock *child) -{ -} - static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f556cfcbfbe..bf5a99d803e4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: - * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -417,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. */ @@ -1680,7 +1701,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. 
+ * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return @@ -2235,7 +2256,7 @@ union bpf_attr { * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it + * if the verdict eBPF program returns **SK_PASS**), redirect it * to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. The @@ -3661,10 +3682,59 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. 
* Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. @@ -3823,6 +3893,9 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd777dd6f967..c6c81eceb68f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -16,7 +16,7 @@ #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ - BPF_F_PRESERVE_ELEMS) + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && - attr->map_flags & BPF_F_MMAPABLE) + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && @@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ -static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; @@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { @@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { - return meta0->max_entries == meta1->max_entries && - bpf_map_meta_equal(meta0, meta1); + if (!bpf_map_meta_equal(meta0, meta1)) + return false; + return meta0->map_flags & BPF_F_INNER_MAP ? 
true : + meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { @@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 array_of_map_gen_lookup(struct bpf_map *map, +static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4d0ee7839fdb..ed7d02e8bc93 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -188,11 +188,6 @@ i < btf_type_vlen(struct_type); \ i++, member++) -#define for_each_vsi(i, struct_type, member) \ - for (i = 0, member = btf_type_var_secinfo(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -440,16 +435,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -/* union is only a special case of struct: - * all its offsetof(member) == 0 - */ -static bool btf_type_is_struct(const struct btf_type *t) -{ - u8 kind = BTF_INFO_KIND(t->info); - - return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; -} - static bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; @@ -460,11 +445,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_var(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; @@ -613,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t) return (const struct btf_var *)(t + 1); } -static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) -{ - return (const struct btf_var_secinfo *)(t + 1); -} - static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3395cf140d22..1815e97d4c9c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) * bpf_prog * __htab_map_lookup_elem */ -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) return __htab_lru_map_lookup_elem(map, key, false); } -static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 htab_of_map_gen_lookup(struct bpf_map *map, +static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e825441781ab..25520f5eeaf6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -623,6 +623,34 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, 
}; +BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) +{ + if (cpu >= nr_cpu_ids) + return (unsigned long)NULL; + + return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); +} + +const struct bpf_func_proto bpf_per_cpu_ptr_proto = { + .func = bpf_per_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) +{ + return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); +} + +const struct bpf_func_proto bpf_this_cpu_ptr_proto = { + .func = bpf_this_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -689,6 +717,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index b367430e611c..3d897de89061 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s) raw_spin_lock_init(&head->lock); head->first = NULL; } + raw_spin_lock_init(&s->extralist.lock); + s->extralist.first = NULL; return 0; } @@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } +static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + if (!raw_spin_trylock(&s->extralist.lock)) + return false; + + pcpu_freelist_push_node(&s->extralist, node); + raw_spin_unlock(&s->extralist.lock); + return true; +} + +static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + int cpu, orig_cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + struct pcpu_freelist_head *head; + + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + + /* cannot lock any per cpu lock, try extralist */ + if (cpu == orig_cpu && + pcpu_freelist_try_push_extra(s, node)) + return; + } +} + void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { - struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - - ___pcpu_freelist_push(head, node); + if (in_nmi()) + ___pcpu_freelist_push_nmi(s, node); + else + ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); } void pcpu_freelist_push(struct pcpu_freelist *s, @@ -81,7 +121,7 @@ again: } } -struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; @@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) if (cpu >= nr_cpu_ids) cpu = 0; if (cpu == orig_cpu) - return NULL; + break; + } + + /* per cpu lists are all empty, try extralist */ + raw_spin_lock(&s->extralist.lock); + node = 
s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +static struct pcpu_freelist_node * +___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + break; } + + /* cannot pop from per cpu lists, try extralist */ + if (!raw_spin_trylock(&s->extralist.lock)) + return NULL; + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +{ + if (in_nmi()) + return ___pcpu_freelist_pop_nmi(s); + return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index fbf8a8a28979..3c76553cfe57 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -13,6 +13,7 @@ struct pcpu_freelist_head { struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; + struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1528c2a6927..1110ecd7d1f3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4323,8 +4323,10 @@ static int bpf_prog_bind_map(union bpf_attr *attr) used_maps_old = prog->aux->used_maps; for (i = 0; i < prog->aux->used_map_cnt; i++) - if (used_maps_old[i] == map) + if (used_maps_old[i] == map) { + bpf_map_put(map); goto out_unlock; + } used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, sizeof(used_maps_new[0]), diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3923c570070b..c43a5e8f0818 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,8 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; + u32 btf_id; + u32 ret_btf_id; }; struct btf *btf_vmlinux; @@ -517,6 +519,7 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", [PTR_TO_RDONLY_BUF] = "rdonly_buf", @@ -583,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) + if (t == PTR_TO_BTF_ID || + t == PTR_TO_BTF_ID_OR_NULL || + t == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -2204,6 +2209,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_PERCPU_BTF_ID: return true; default: return false; @@ -2221,6 +2227,20 @@ static bool register_is_const(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); } +static bool 
__is_scalar_unbounded(struct bpf_reg_state *reg) +{ + return tnum_is_unknown(reg->var_off) && + reg->smin_value == S64_MIN && reg->smax_value == S64_MAX && + reg->umin_value == 0 && reg->umax_value == U64_MAX && + reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX && + reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX; +} + +static bool register_is_bounded(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg); +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -2272,7 +2292,7 @@ static int check_stack_write(struct bpf_verifier_env *env, if (value_regno >= 0) reg = &cur->regs[value_regno]; - if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit @@ -2667,7 +2687,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; - /* fallthrough */ + fallthrough; /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: @@ -3978,6 +3998,7 @@ static const struct bpf_reg_types sock_types = { }, }; +#ifdef CONFIG_NET static const struct bpf_reg_types btf_id_sock_common_types = { .types = { PTR_TO_SOCK_COMMON, @@ -3988,6 +4009,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = { }, .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], }; +#endif static const struct bpf_reg_types mem_types = { .types = { @@ -4017,6 +4039,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4030,7 +4053,9 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CTX] = &context_types, [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, +#ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, +#endif [ARG_PTR_TO_SOCKET] = &fullsock_types, [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, @@ -4042,6 +4067,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4205,6 +4231,12 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + if (!reg->btf_id) { + verbose(env, "Helper has invalid btf_id in R%d\n", regno); + return -EACCES; + } + meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -5114,6 +5146,35 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int 
insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + const struct btf_type *t; + + mark_reg_known_zero(env, regs, BPF_REG_0); + t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + if (!btf_type_is_struct(t)) { + u32 tsize; + const struct btf_type *ret; + const char *tname; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].mem_size = tsize; + } else { + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { int ret_btf_id; @@ -5432,7 +5493,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) break; - /* fall-through */ + fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: @@ -6389,6 +6450,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, src_reg = NULL; if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; + else + /* Make sure ID is cleared otherwise dst_reg min/max could be + * incorrectly propagated into other registers by find_equal_scalars() + */ + dst_reg->id = 0; if (BPF_SRC(insn->code) == BPF_X) { src_reg = ®s[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { @@ -6522,6 +6588,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R1 = R2 * copy register state to dest reg */ + if (src_reg->type == SCALAR_VALUE && !src_reg->id) + /* Assign src and dst registers the same ID + * that will be used by find_equal_scalars() + * to propagate min/max range. 
+ */ + src_reg->id = ++env->id_gen; *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; @@ -6534,6 +6606,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; + /* Make sure ID is cleared otherwise + * dst_reg min/max could be incorrectly + * propagated into src_reg by find_equal_scalars() + */ + dst_reg->id = 0; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { @@ -7322,6 +7399,30 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } +static void find_equal_scalars(struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= vstate->curframe; i++) { + state = vstate->frame[i]; + for (j = 0; j < MAX_BPF_REG; j++) { + reg = &state->regs[j]; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + + bpf_for_each_spilled_reg(j, state, reg) { + if (!reg) + continue; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -7450,6 +7551,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], src_reg, dst_reg, opcode); + if (src_reg->id) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } + } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], @@ -7457,6 +7563,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, opcode, is_jmp32); } + if (dst_reg->type == SCALAR_VALUE && dst_reg->id) { + find_equal_scalars(this_branch, dst_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. 
@@ -7488,6 +7599,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *dst_reg; struct bpf_map *map; int err; @@ -7504,25 +7616,45 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + dst_reg = ®s[insn->dst_reg]; if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = SCALAR_VALUE; + dst_reg->type = SCALAR_VALUE; __mark_reg_known(®s[insn->dst_reg], imm); return 0; } + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { + mark_reg_known_zero(env, regs, insn->dst_reg); + + dst_reg->type = aux->btf_var.reg_type; + switch (dst_reg->type) { + case PTR_TO_MEM: + dst_reg->mem_size = aux->btf_var.mem_size; + break; + case PTR_TO_BTF_ID: + case PTR_TO_PERCPU_BTF_ID: + dst_reg->btf_id = aux->btf_var.btf_id; + break; + default: + verbose(env, "bpf verifier is misconfigured\n"); + return -EFAULT; + } + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); - regs[insn->dst_reg].map_ptr = map; + dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].off = aux->map_off; + dst_reg->type = PTR_TO_MAP_VALUE; + dst_reg->off = aux->map_off; if (map_value_has_spin_lock(map)) - regs[insn->dst_reg].id = ++env->id_gen; + dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -9424,6 +9556,92 @@ process_bpf_exit: return 0; } +/* replace pseudo btf_id with kernel symbol address */ +static int check_pseudo_btf_id(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux) +{ + u32 datasec_id, type, id = insn->imm; + const struct btf_var_secinfo *vsi; + const struct btf_type *datasec; + const struct btf_type *t; + const char *sym_name; + bool percpu = false; + u64 addr; + int i; + + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + + if (insn[1].imm != 0) { + verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, id); + if (!t) { + verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); + return -ENOENT; + } + + if (!btf_type_is_var(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", + id); + return -EINVAL; + } + + sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + addr = kallsyms_lookup_name(sym_name); + if (!addr) { + verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", + sym_name); + return -ENOENT; + } + + datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", + BTF_KIND_DATASEC); + if (datasec_id > 0) { + datasec = btf_type_by_id(btf_vmlinux, datasec_id); + for_each_vsi(i, datasec, vsi) { + if (vsi->type == id) { + percpu = true; + break; + } + } + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + type = t->type; + t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + if (percpu) { + aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf_id = type; + } else if (!btf_type_is_struct(t)) { + const struct btf_type *ret; + const char *tname; + u32 tsize; + + /* resolve 
the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.mem_size = tsize; + } else { + aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf_id = type; + } + return 0; +} + static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && @@ -9534,10 +9752,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map) map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } -/* look for pseudo eBPF instructions that access map FDs and - * replace them with actual map pointers +/* find and rewrite pseudo imm in ld_imm64 instructions: + * + * 1. if it accesses map FD, replace it with actual map pointer. + * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. + * + * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ -static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) +static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -9578,6 +9800,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; + if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { + aux = &env->insn_aux_data[i]; + err = check_pseudo_btf_id(env, insn, aux); + if (err) + return err; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -10819,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -10849,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); - +patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - @@ -11633,10 +11865,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) @@ -11662,6 +11890,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret) goto skip_full_check; + ret = resolve_pseudo_ldimm64(env); + if (ret < 0) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e118a83439c3..a2a4535b6277 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1327,6 +1327,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? 
&bpf_copy_from_user_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: return NULL; } @@ -1776,7 +1780,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { +#ifdef CONFIG_NET .test_run = bpf_prog_test_run_raw_tp, +#endif }; const struct bpf_verifier_ops tracing_verifier_ops = { diff --git a/net/core/dev.c b/net/core/dev.c index a146bac84320..22babf5bcc88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) + struct net_device *orig_dev, bool *another) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); @@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * redirecting to another netdev */ __skb_push(skb, skb->mac_len); - skb_do_redirect(skb); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } return NULL; case TC_ACT_CONSUMED: return NULL; @@ -5163,7 +5167,12 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); + bool another = false; + + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; if (!skb) goto out; diff --git a/net/core/filter.c b/net/core/filter.c index bc6bd2b323e8..c5e2a1c5fd8d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -76,6 +76,7 @@ #include <net/bpf_sk_storage.h> #include <net/transp_v6.h> #include <linux/btf_ids.h> +#include <net/tls.h> static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -2379,8 +2380,9 @@ out: /* Internal, non-exposed redirect flags. */ enum { - BPF_F_NEIGH = (1ULL << 1), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) + BPF_F_NEIGH = (1ULL << 1), + BPF_F_PEER = (1ULL << 2), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2429,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; - if (unlikely(!dev)) { - kfree_skb(skb); - return -EINVAL; + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (unlikely(!ops->ndo_get_peer_dev || + !skb_at_tc_ingress(skb))) + goto out_drop; + dev = ops->ndo_get_peer_dev(dev); + if (unlikely(!dev || + !is_skb_forwardable(dev, skb) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + return -EAGAIN; } - return flags & BPF_F_NEIGH ? 
__bpf_redirect_neigh(skb, dev) : __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) @@ -2465,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -3479,6 +3518,48 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) SKB_MAX_ALLOC; } +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + u32 len_diff_abs = abs(len_diff); + bool shrink = len_diff < 0; + int ret = 0; + + if (unlikely(flags || mode)) + return -EINVAL; + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + + if (!shrink) { + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + __skb_push(skb, len_diff_abs); + memset(skb->data, 0, len_diff_abs); + } else { + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) + return -ENOMEM; + __skb_pull(skb, len_diff_abs); + } + bpf_compute_data_end_sk_skb(skb); + if (tls_sw_has_ctx_rx(skb->sk)) { + struct strp_msg *rxm = strp_msg(skb); + + rxm->full_len += len_diff; + } + return ret; +} + +static const struct bpf_func_proto sk_skb_adjust_room_proto = { + .func = sk_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { @@ -4784,6 +4865,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else icsk->icsk_user_timeout = val; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: ret = -EINVAL; } @@ -5149,7 +5234,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; - params->ifindex = dev->ifindex; return 0; } @@ -5246,6 +5330,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here @@ -5371,6 +5456,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. 
@@ -6745,6 +6831,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_tail || func == sk_skb_change_tail || func == bpf_skb_adjust_room || + func == sk_skb_adjust_room || func == bpf_skb_pull_data || func == sk_skb_pull_data || func == bpf_clone_redirect || @@ -7005,6 +7092,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: @@ -7218,6 +7307,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; + case BPF_FUNC_skb_adjust_room: + return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4b5f7c8fecd1..654182ecf87b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -433,10 +433,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { - if (ingress) - return sk_psock_skb_ingress(psock, skb); - else + if (!ingress) { + if (!sock_writeable(psock->sk)) + return -EAGAIN; return skb_send_sock_locked(psock->sk, skb, off, len); + } + return sk_psock_skb_ingress(psock, skb); } static void sk_psock_backlog(struct work_struct *work) @@ -625,6 +627,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.skb_parser) sk_psock_stop_strp(sk, psock); + else if (psock->progs.skb_verdict) + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -682,19 +686,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - int ret; - - skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); - ret = bpf_prog_run_pin_on_cpu(prog, skb); - /* strparser clones the skb before handing it to a upper layer, - * meaning skb_orphan has been called. We NULL sk on the way out - * to ensure we don't trigger a BUG_ON() in skb/sk operations - * later and because we are not charging the memory of this skb - * to any socket yet. - */ - skb->sk = NULL; - return ret; + return bpf_prog_run_pin_on_cpu(prog, skb); } static struct sk_psock *sk_psock_from_strp(struct strparser *strp) @@ -709,38 +702,35 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; - bool ingress; sk_other = tcp_skb_bpf_redirect_fetch(skb); + /* This error is a buggy BPF program, it returned a redirect + * return code, but then didn't set a redirect interface. + */ if (unlikely(!sk_other)) { kfree_skb(skb); return; } psock_other = sk_psock(sk_other); + /* This error indicates the socket is being torn down or had another + * error that caused the pipe to break. We can't send a packet on + * a socket that is in this state so we drop the skb. 
+ */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { kfree_skb(skb); return; } - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - } else { - kfree_skb(skb); - } + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); } -static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) { switch (verdict) { case __SK_REDIRECT: + skb_set_owner_r(skb, sk); sk_psock_skb_redirect(skb); break; case __SK_PASS: @@ -758,11 +748,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { + /* We skip full set_owner_r here because if we do a SK_PASS + * or SK_DROP we can skip skb memory accounting and use the + * TLS context. + */ + skb->sk = psock->sk; tcp_skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb->sk = NULL; } - sk_psock_tls_verdict_apply(skb, ret); + sk_psock_tls_verdict_apply(skb, psock->sk, ret); rcu_read_unlock(); return ret; } @@ -771,7 +767,9 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { + struct tcp_skb_cb *tcp; struct sock *sk_other; + int err = -EIO; switch (verdict) { case __SK_PASS: @@ -780,16 +778,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { goto out_free; } - if (atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf) { - struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + tcp = TCP_SKB_CB(skb); + tcp->bpf.flags |= BPF_F_INGRESS; + + /* If the queue is empty then we can submit directly + * into the msg queue. If its not empty we have to + * queue work otherwise we may get OOO data. Otherwise, + * if sk_psock_skb_ingress errors will be handled by + * retrying later from workqueue. 
+ */ + if (skb_queue_empty(&psock->ingress_skb)) { + err = sk_psock_skb_ingress(psock, skb); + } + if (err < 0) { skb_queue_tail(&psock->ingress_skb, skb); schedule_work(&psock->work); - break; } - goto out_free; + break; case __SK_REDIRECT: sk_psock_skb_redirect(skb); break; @@ -814,9 +820,9 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) kfree_skb(skb); goto out; } + skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - skb_orphan(skb); tcp_skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); @@ -839,8 +845,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_parser); - if (likely(prog)) + if (likely(prog)) { + skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); + skb->sk = NULL; + } rcu_read_unlock(); return ret; } @@ -864,6 +873,57 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t orig_len) +{ + struct sock *sk = (struct sock *)desc->arg.data; + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + int len = skb->len; + + /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) { + desc->error = -ENOMEM; + return 0; + } + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + len = 0; + kfree_skb(skb); + goto out; + } + skb_set_owner_r(skb, sk); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + sk_psock_verdict_apply(psock, skb, ret); +out: + rcu_read_unlock(); + return len; +} + +static void sk_psock_verdict_data_ready(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + read_descriptor_t desc; + + if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + return; + + desc.arg.data = sk; + desc.error = 0; + desc.count = 1; + + sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); +} + static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; @@ -893,6 +953,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) return strp_init(&psock->parser.strp, sk, &cb); } +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_verdict_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -918,3 +991,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) strp_stop(&parser->strp); parser->enabled = false; } + +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + parser->enabled = false; +} diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e83a80e8f13b..df09c39a4dd2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,8 +148,8 @@ static void 
sock_map_add_link(struct sk_psock *psock, static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { + bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; - bool strp_stop = false; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { @@ -159,14 +159,19 @@ static void sock_map_del_link(struct sock *sk, map); if (psock->parser.enabled && stab->progs.skb_parser) strp_stop = true; + if (psock->parser.enabled && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); - if (strp_stop) { + if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); - sk_psock_stop_strp(sk, psock); + if (strp_stop) + sk_psock_stop_strp(sk, psock); + else + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); } } @@ -230,16 +235,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; struct sk_psock *psock; - bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict); skb_parser = READ_ONCE(progs->skb_parser); - skb_progs = skb_parser && skb_verdict; - if (skb_progs) { + if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) return PTR_ERR(skb_verdict); + } + if (skb_parser) { skb_parser = bpf_prog_inc_not_zero(skb_parser); if (IS_ERR(skb_parser)) { bpf_prog_put(skb_verdict); @@ -264,7 +269,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + (skb_parser && READ_ONCE(psock->progs.skb_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -285,28 +291,31 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_progs && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock); - if (ret) { - write_unlock_bh(&sk->sk_callback_lock); - goto out_drop; - } + if (ret) + goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); + } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; +out_unlock_drop: + write_unlock_bh(&sk->sk_callback_lock); out_drop: sk_psock_put(sk, psock); out_progs: if (msg_parser) bpf_prog_put(msg_parser); out: - if (skb_progs) { + if (skb_verdict) bpf_prog_put(skb_verdict); + if (skb_parser) bpf_prog_put(skb_parser); - } return ret; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 56c306e3cd2f..495dda2449fe 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -548,7 +548,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); - bpf_skops_init_child(sk, newsk); tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e63fadd000db..64c9e55d4d4e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -3,9 +3,6 
@@ #include <net/xsk_buff_pool.h> #include <net/xdp_sock.h> #include <net/xdp_sock_drv.h> -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> -#include <linux/swiotlb.h> #include "xsk_queue.h" #include "xdp_umem.h" diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index dc1dd5ef70d1..cdb9cf3cd136 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -15,6 +15,10 @@ struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; + /* Hinder the adjacent cache prefetcher to prefetch the consumer + * pointer if the producer pointer is touched and vice versa. + */ + u32 pad ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; u32 flags; }; diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 0c5df593bc56..49da2b8ace8b 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; struct bpf_insn *insn = insn_buf; diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4f1ed0e3cf9f..aeebf5d12f32 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -98,8 +98,8 @@ test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o xdp_redirect-objs := xdp_redirect_user.o xdp_redirect_map-objs := xdp_redirect_map_user.o -xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o -xdp_monitor-objs := bpf_load.o xdp_monitor_user.o +xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o +xdp_monitor-objs := xdp_monitor_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o cpustat-objs := cpustat_user.o @@ -211,6 +211,8 @@ TPROGLDLIBS_xsk_fwd += -pthread # make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang LLC ?= llc CLANG ?= clang +OPT ?= opt +LLVM_DIS ?= llvm-dis LLVM_OBJCOPY ?= llvm-objcopy BTF_PAHOLE ?= pahole @@ -303,6 +305,11 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is # useless for BPF samples. +# below we use long chain of commands, clang | opt | llvm-dis | llc, +# to generate final object file. 'clang' compiles the source into IR +# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin +# processing (llvm12) and IR optimizations. 'llvm-dis' converts +# 'opt' output to IR, and finally 'llc' generates bpf byte code. 
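For context on the "CO-RE IR builtin processing" mentioned in the comment above: clang emits relocation builtins for field accesses that are marked relocatable, rather than hard-coded offsets, and those builtins are what the 'opt' stage has to handle before 'llc' produces the BPF object. A minimal, illustrative sketch of such an access (not code from samples/bpf; assumes a clang with BPF CO-RE support and a BTF-aware loader such as libbpf):

/* Minimal local definition of a kernel struct, flagged relocatable:
 * clang records a CO-RE relocation for the field access below instead
 * of a fixed offset, and libbpf patches the offset against the running
 * kernel's BTF at program load time.
 */
#include <linux/types.h>

struct task_struct {
	int pid;
} __attribute__((preserve_access_index));

static inline int task_pid(const struct task_struct *t)
{
	return t->pid;	/* offset resolved at load time, not compile time */
}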
$(obj)/%.o: $(src)/%.c @echo " CLANG-bpf " $@ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ @@ -314,7 +321,9 @@ $(obj)/%.o: $(src)/%.c -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ + -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ + $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index 4b22ace52f80..ff4c533dfac2 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -40,6 +40,7 @@ #include <errno.h> #include <fcntl.h> #include <linux/unistd.h> +#include <linux/compiler.h> #include <linux/bpf.h> #include <bpf/bpf.h> @@ -483,7 +484,7 @@ int main(int argc, char **argv) "Option -%c requires an argument.\n\n", optopt); case 'h': - fallthrough; + __fallthrough; default: Usage(); return 0; diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 3d33cca2d48a..5c955b812c47 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -6,21 +6,21 @@ #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> -struct bpf_map_def SEC("maps") redirect_err_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 2, +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 2); /* TODO: have entries for all possible errno's */ -}; +} redirect_err_cnt SEC(".maps"); #define XDP_UNKNOWN XDP_REDIRECT + 1 -struct bpf_map_def SEC("maps") exception_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = XDP_UNKNOWN + 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, XDP_UNKNOWN + 1); +} exception_cnt SEC(".maps"); /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format * Code in: kernel/include/trace/events/xdp.h @@ -129,19 +129,19 @@ struct datarec { }; #define MAX_CPUS 64 -struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = MAX_CPUS, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, MAX_CPUS); +} cpumap_enqueue_cnt SEC(".maps"); -struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} cpumap_kthread_cnt SEC(".maps"); /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format * Code in: kernel/include/trace/events/xdp.h @@ -210,12 +210,12 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) return 0; } -struct bpf_map_def SEC("maps") devmap_xmit_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} 
devmap_xmit_cnt SEC(".maps"); /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format * Code in: kernel/include/trace/events/xdp.h diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index ef53b93db573..03d0a182913f 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -26,12 +26,37 @@ static const char *__doc_err_only__= #include <net/if.h> #include <time.h> +#include <signal.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "bpf_util.h" +enum map_type { + REDIRECT_ERR_CNT, + EXCEPTION_CNT, + CPUMAP_ENQUEUE_CNT, + CPUMAP_KTHREAD_CNT, + DEVMAP_XMIT_CNT, +}; + +static const char *const map_type_strings[] = { + [REDIRECT_ERR_CNT] = "redirect_err_cnt", + [EXCEPTION_CNT] = "exception_cnt", + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", + [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt", +}; + +#define NUM_MAP 5 +#define NUM_TP 8 + +static int tp_cnt; +static int map_cnt; static int verbose = 1; static bool debug = false; +struct bpf_map *map_data[NUM_MAP] = {}; +struct bpf_link *tp_links[NUM_TP] = {}; +struct bpf_object *obj; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -41,6 +66,16 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +static void int_exit(int sig) +{ + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); + exit(0); +} + /* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ #define EXIT_FAIL_MEM 5 @@ -483,23 +518,23 @@ static bool stats_collect(struct stats_record *rec) * this can happen by someone running perf-record -e */ - fd = map_data[0].fd; /* map0: redirect_err_cnt */ + fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]); for (i = 0; i < REDIR_RES_MAX; i++) map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); - fd = map_data[1].fd; /* map1: exception_cnt */ + fd = bpf_map__fd(map_data[EXCEPTION_CNT]); for (i = 0; i < XDP_ACTION_MAX; i++) { map_collect_record_u64(fd, i, &rec->xdp_exception[i]); } - fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ + fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]); for (i = 0; i < MAX_CPUS; i++) map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); - fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ + fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]); map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); - fd = map_data[4].fd; /* map4: devmap_xmit_cnt */ + fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]); map_collect_record(fd, 0, &rec->xdp_devmap_xmit); return true; @@ -598,8 +633,8 @@ static void stats_poll(int interval, bool err_only) /* TODO Need more advanced stats on error types */ if (verbose) { - printf(" - Stats map0: %s\n", map_data[0].name); - printf(" - Stats map1: %s\n", map_data[1].name); + printf(" - Stats map0: %s\n", bpf_map__name(map_data[0])); + printf(" - Stats map1: %s\n", bpf_map__name(map_data[1])); printf("\n"); } fflush(stdout); @@ -618,44 +653,51 @@ static void stats_poll(int interval, bool err_only) static void print_bpf_prog_info(void) { - int i; + struct bpf_program *prog; + struct bpf_map *map; + int i = 0; /* Prog info */ - printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt); - for (i = 0; i < prog_cnt; i++) { - printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]); + printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt); + bpf_object__for_each_program(prog, obj) { + printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog)); + 
i++; } + i = 0; /* Maps info */ - printf("Loaded BPF prog have %d map(s)\n", map_data_count); - for (i = 0; i < map_data_count; i++) { - char *name = map_data[i].name; - int fd = map_data[i].fd; + printf("Loaded BPF prog have %d map(s)\n", map_cnt); + bpf_object__for_each_map(map, obj) { + const char *name = bpf_map__name(map); + int fd = bpf_map__fd(map); printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name); + i++; } /* Event info */ - printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt); - for (i = 0; i < prog_cnt; i++) { - if (event_fd[i] != -1) - printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]); + printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt); + for (i = 0; i < tp_cnt; i++) { + int fd = bpf_link__fd(tp_links[i]); + + if (fd != -1) + printf(" - event_fd[%d] = fd(%d)\n", i, fd); } } int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; int longindex = 0, opt; - int ret = EXIT_SUCCESS; - char bpf_obj_file[256]; + int ret = EXIT_FAILURE; + enum map_type type; + char filename[256]; /* Default settings: */ bool errors_only = true; int interval = 2; - snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]); - /* Parse commands line args */ while ((opt = getopt_long(argc, argv, "hDSs:", long_options, &longindex)) != -1) { @@ -672,40 +714,79 @@ int main(int argc, char **argv) case 'h': default: usage(argv); - return EXIT_FAILURE; + return ret; } } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK)"); - return EXIT_FAILURE; + return ret; } - if (load_bpf_file(bpf_obj_file)) { - printf("ERROR - bpf_log_buf: %s", bpf_log_buf); - return EXIT_FAILURE; + /* Remove tracepoint program when program is interrupted or killed */ + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (type = 0; type < NUM_MAP; type++) { + map_data[type] = + bpf_object__find_map_by_name(obj, map_type_strings[type]); + + if (libbpf_get_error(map_data[type])) { + printf("ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + map_cnt++; } - if (!prog_fd[0]) { - printf("ERROR - load_bpf_file: %s\n", strerror(errno)); - return EXIT_FAILURE; + + bpf_object__for_each_program(prog, obj) { + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + printf("ERROR: bpf_program__attach failed\n"); + tp_links[tp_cnt] = NULL; + goto cleanup; + } + tp_cnt++; } if (debug) { print_bpf_prog_info(); } - /* Unload/stop tracepoint event by closing fd's */ + /* Unload/stop tracepoint event by closing bpf_link's */ if (errors_only) { - /* The prog_fd[i] and event_fd[i] depend on the - * order the functions was defined in _kern.c + /* The bpf_link[i] depend on the order of + * the functions was defined in _kern.c */ - close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */ - close(prog_fd[2]); /* func: trace_xdp_redirect */ - close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */ - close(prog_fd[3]); /* func: trace_xdp_redirect_map */ + bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */ + tp_links[2] = NULL; + + bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */ + 
tp_links[3] = NULL; } stats_poll(interval, errors_only); + ret = EXIT_SUCCESS; + +cleanup: + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); return ret; } diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 3dd366e9474d..6fb8dbde62c5 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -37,18 +37,35 @@ static __u32 prog_id; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int n_cpus; -static int cpu_map_fd; -static int rx_cnt_map_fd; -static int redirect_err_cnt_map_fd; -static int cpumap_enqueue_cnt_map_fd; -static int cpumap_kthread_cnt_map_fd; -static int cpus_available_map_fd; -static int cpus_count_map_fd; -static int cpus_iterator_map_fd; -static int exception_cnt_map_fd; + +enum map_type { + CPU_MAP, + RX_CNT, + REDIRECT_ERR_CNT, + CPUMAP_ENQUEUE_CNT, + CPUMAP_KTHREAD_CNT, + CPUS_AVAILABLE, + CPUS_COUNT, + CPUS_ITERATOR, + EXCEPTION_CNT, +}; + +static const char *const map_type_strings[] = { + [CPU_MAP] = "cpu_map", + [RX_CNT] = "rx_cnt", + [REDIRECT_ERR_CNT] = "redirect_err_cnt", + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", + [CPUS_AVAILABLE] = "cpus_available", + [CPUS_COUNT] = "cpus_count", + [CPUS_ITERATOR] = "cpus_iterator", + [EXCEPTION_CNT] = "exception_cnt", +}; #define NUM_TP 5 -struct bpf_link *tp_links[NUM_TP] = { 0 }; +#define NUM_MAP 9 +struct bpf_link *tp_links[NUM_TP] = {}; +static int map_fds[NUM_MAP]; static int tp_cnt = 0; /* Exit return codes */ @@ -527,20 +544,20 @@ static void stats_collect(struct stats_record *rec) { int fd, i; - fd = rx_cnt_map_fd; + fd = map_fds[RX_CNT]; map_collect_percpu(fd, 0, &rec->rx_cnt); - fd = redirect_err_cnt_map_fd; + fd = map_fds[REDIRECT_ERR_CNT]; map_collect_percpu(fd, 1, &rec->redir_err); - fd = cpumap_enqueue_cnt_map_fd; + fd = map_fds[CPUMAP_ENQUEUE_CNT]; for (i = 0; i < n_cpus; i++) map_collect_percpu(fd, i, &rec->enq[i]); - fd = cpumap_kthread_cnt_map_fd; + fd = map_fds[CPUMAP_KTHREAD_CNT]; map_collect_percpu(fd, 0, &rec->kthread); - fd = exception_cnt_map_fd; + fd = map_fds[EXCEPTION_CNT]; map_collect_percpu(fd, 0, &rec->exception); } @@ -565,7 +582,7 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, /* Add a CPU entry to cpumap, as this allocate a cpu entry in * the kernel for the cpu. */ - ret = bpf_map_update_elem(cpu_map_fd, &cpu, value, 0); + ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0); if (ret) { fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); exit(EXIT_FAIL_BPF); @@ -574,21 +591,21 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, /* Inform bpf_prog's that a new CPU is available to select * from via some control maps. 
*/ - ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0); + ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0); if (ret) { fprintf(stderr, "Add to avail CPUs failed\n"); exit(EXIT_FAIL_BPF); } /* When not replacing/updating existing entry, bump the count */ - ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count); + ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count); if (ret) { fprintf(stderr, "Failed reading curr cpus_count\n"); exit(EXIT_FAIL_BPF); } if (new) { curr_cpus_count++; - ret = bpf_map_update_elem(cpus_count_map_fd, &key, + ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count, 0); if (ret) { fprintf(stderr, "Failed write curr cpus_count\n"); @@ -612,7 +629,7 @@ static void mark_cpus_unavailable(void) int ret, i; for (i = 0; i < n_cpus; i++) { - ret = bpf_map_update_elem(cpus_available_map_fd, &i, + ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i, &invalid_cpu, 0); if (ret) { fprintf(stderr, "Failed marking CPU unavailable\n"); @@ -665,68 +682,37 @@ static void stats_poll(int interval, bool use_separators, char *prog_name, free_stats_record(prev); } -static struct bpf_link * attach_tp(struct bpf_object *obj, - const char *tp_category, - const char* tp_name) +static int init_tracepoints(struct bpf_object *obj) { struct bpf_program *prog; - struct bpf_link *link; - char sec_name[PATH_MAX]; - int len; - len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s", - tp_category, tp_name); - if (len < 0) - exit(EXIT_FAIL); + bpf_object__for_each_program(prog, obj) { + if (bpf_program__is_tracepoint(prog) != true) + continue; - prog = bpf_object__find_program_by_title(obj, sec_name); - if (!prog) { - fprintf(stderr, "ERR: finding progsec: %s\n", sec_name); - exit(EXIT_FAIL_BPF); + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + tp_links[tp_cnt] = NULL; + return -EINVAL; + } + tp_cnt++; } - link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); - if (libbpf_get_error(link)) - exit(EXIT_FAIL_BPF); - - return link; -} - -static void init_tracepoints(struct bpf_object *obj) { - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread"); + return 0; } static int init_map_fds(struct bpf_object *obj) { - /* Maps updated by tracepoints */ - redirect_err_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt"); - exception_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "exception_cnt"); - cpumap_enqueue_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt"); - cpumap_kthread_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt"); - - /* Maps used by XDP */ - rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); - cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map"); - cpus_available_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpus_available"); - cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count"); - cpus_iterator_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpus_iterator"); - - if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 || - redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 || - cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 || - cpus_count_map_fd < 0 || 
cpus_iterator_map_fd < 0 || - exception_cnt_map_fd < 0) - return -ENOENT; + enum map_type type; + + for (type = 0; type < NUM_MAP; type++) { + map_fds[type] = + bpf_object__find_map_fd_by_name(obj, + map_type_strings[type]); + + if (map_fds[type] < 0) + return -ENOENT; + } return 0; } @@ -795,13 +781,13 @@ int main(int argc, char **argv) bool stress_mode = false; struct bpf_program *prog; struct bpf_object *obj; + int err = EXIT_FAIL; char filename[256]; int added_cpus = 0; int longindex = 0; int interval = 2; int add_cpu = -1; - int opt, err; - int prog_fd; + int opt, prog_fd; int *cpu, i; __u32 qsize; @@ -824,24 +810,29 @@ int main(int argc, char **argv) } if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return EXIT_FAIL; + return err; if (prog_fd < 0) { fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", strerror(errno)); - return EXIT_FAIL; + return err; } - init_tracepoints(obj); + + if (init_tracepoints(obj) < 0) { + fprintf(stderr, "ERR: bpf_program__attach failed\n"); + return err; + } + if (init_map_fds(obj) < 0) { fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); - return EXIT_FAIL; + return err; } mark_cpus_unavailable(); cpu = malloc(n_cpus * sizeof(int)); if (!cpu) { fprintf(stderr, "failed to allocate cpu array\n"); - return EXIT_FAIL; + return err; } memset(cpu, 0, n_cpus * sizeof(int)); @@ -960,14 +951,12 @@ int main(int argc, char **argv) prog = bpf_object__find_program_by_title(obj, prog_name); if (!prog) { fprintf(stderr, "bpf_object__find_program_by_title failed\n"); - err = EXIT_FAIL; goto out; } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { fprintf(stderr, "bpf_program__fd failed\n"); - err = EXIT_FAIL; goto out; } @@ -986,6 +975,8 @@ int main(int argc, char **argv) stats_poll(interval, use_separators, prog_name, mprog_name, &value, stress_mode); + + err = EXIT_OK; out: free(cpu); return err; diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c index 33377289e2a8..9cf76b340dd7 100644 --- a/samples/bpf/xdp_sample_pkts_kern.c +++ b/samples/bpf/xdp_sample_pkts_kern.c @@ -5,14 +5,12 @@ #include <bpf/bpf_helpers.h> #define SAMPLE_SIZE 64ul -#define MAX_CPUS 128 - -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = MAX_CPUS, -}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); +} my_map SEC(".maps"); SEC("xdp_sample") int xdp_sample_prog(struct xdp_md *ctx) diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index 991ef6f0880b..4b2a300c750c 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -18,7 +18,6 @@ #include "perf-sys.h" -#define MAX_CPUS 128 static int if_idx; static char *if_name; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b220173dbe1e..1149e94ca32f 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -11,6 +11,7 @@ #include <linux/if_xdp.h> #include <linux/if_ether.h> #include <linux/ip.h> +#include <linux/limits.h> #include <linux/udp.h> #include <arpa/inet.h> #include <locale.h> @@ -79,6 +80,10 @@ static u16 opt_pkt_size = MIN_PKT_SIZE; static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_extra_stats; static bool opt_quiet; +static bool opt_app_stats; +static const char *opt_irq_str = ""; +static u32 irq_no; +static int irqs_at_init = -1; 
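The application and driver statistics added to xdpsock below (dump_app_stats() and dump_driver_stats()) reuse the cumulative-counter pattern of the existing ring stats: counters only ever increase, and each report converts the delta since the previous report into a per-second rate. A minimal sketch of that pattern, with illustrative names rather than the sample's actual symbols:

struct cum_counter {
	unsigned long cur;	/* monotonically increasing count */
	unsigned long prev;	/* value at the previous report */
};

static double report_rate(struct cum_counter *c, long dt_ns)
{
	double per_sec = (c->cur - c->prev) * 1000000000. / dt_ns;

	c->prev = c->cur;	/* next report measures from here */
	return per_sec;
}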
static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -91,18 +96,7 @@ static bool opt_need_wakeup = true; static u32 opt_num_xsks = 1; static u32 prog_id; -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - void *buffer; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; +struct xsk_ring_stats { unsigned long rx_npkts; unsigned long tx_npkts; unsigned long rx_dropped_npkts; @@ -119,6 +113,41 @@ struct xsk_socket_info { unsigned long prev_rx_full_npkts; unsigned long prev_rx_fill_empty_npkts; unsigned long prev_tx_empty_npkts; +}; + +struct xsk_driver_stats { + unsigned long intrs; + unsigned long prev_intrs; +}; + +struct xsk_app_stats { + unsigned long rx_empty_polls; + unsigned long fill_fail_polls; + unsigned long copy_tx_sendtos; + unsigned long tx_wakeup_sendtos; + unsigned long opt_polls; + unsigned long prev_rx_empty_polls; + unsigned long prev_fill_fail_polls; + unsigned long prev_copy_tx_sendtos; + unsigned long prev_tx_wakeup_sendtos; + unsigned long prev_opt_polls; +}; + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + struct xsk_ring_stats ring_stats; + struct xsk_app_stats app_stats; + struct xsk_driver_stats drv_stats; u32 outstanding_tx; }; @@ -173,18 +202,151 @@ static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) return err; if (optlen == sizeof(struct xdp_statistics)) { - xsk->rx_dropped_npkts = stats.rx_dropped; - xsk->rx_invalid_npkts = stats.rx_invalid_descs; - xsk->tx_invalid_npkts = stats.tx_invalid_descs; - xsk->rx_full_npkts = stats.rx_ring_full; - xsk->rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; - xsk->tx_empty_npkts = stats.tx_ring_empty_descs; + xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; + xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; + xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; + xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; + xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; + xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; return 0; } return -EINVAL; } +static void dump_app_stats(long dt) +{ + int i; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, + tx_wakeup_sendtos_ps, opt_polls_ps; + + rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls - + xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt; + fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls - + xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt; + copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos - + xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt; + tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos - + xsks[i]->app_stats.prev_tx_wakeup_sendtos) + * 1000000000. / dt; + opt_polls_ps = (xsks[i]->app_stats.opt_polls - + xsks[i]->app_stats.prev_opt_polls) * 1000000000. 
/ dt; + + printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count"); + printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls); + printf(fmt, "fill fail polls", fill_fail_polls_ps, + xsks[i]->app_stats.fill_fail_polls); + printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps, + xsks[i]->app_stats.copy_tx_sendtos); + printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps, + xsks[i]->app_stats.tx_wakeup_sendtos); + printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls); + + xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls; + xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls; + xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos; + xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; + xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; + } +} + +static bool get_interrupt_number(void) +{ + FILE *f_int_proc; + char line[4096]; + bool found = false; + + f_int_proc = fopen("/proc/interrupts", "r"); + if (f_int_proc == NULL) { + printf("Failed to open /proc/interrupts.\n"); + return found; + } + + while (!feof(f_int_proc) && !found) { + /* Make sure to read a full line at a time */ + if (fgets(line, sizeof(line), f_int_proc) == NULL || + line[strlen(line) - 1] != '\n') { + printf("Error reading from interrupts file\n"); + break; + } + + /* Extract interrupt number from line */ + if (strstr(line, opt_irq_str) != NULL) { + irq_no = atoi(line); + found = true; + break; + } + } + + fclose(f_int_proc); + + return found; +} + +static int get_irqs(void) +{ + char count_path[PATH_MAX]; + int total_intrs = -1; + FILE *f_count_proc; + char line[4096]; + + snprintf(count_path, sizeof(count_path), + "/sys/kernel/irq/%i/per_cpu_count", irq_no); + f_count_proc = fopen(count_path, "r"); + if (f_count_proc == NULL) { + printf("Failed to open %s\n", count_path); + return total_intrs; + } + + if (fgets(line, sizeof(line), f_count_proc) == NULL || + line[strlen(line) - 1] != '\n') { + printf("Error reading from %s\n", count_path); + } else { + static const char com[2] = ","; + char *token; + + total_intrs = 0; + token = strtok(line, com); + while (token != NULL) { + /* sum up interrupts across all cores */ + total_intrs += atoi(token); + token = strtok(NULL, com); + } + } + + fclose(f_count_proc); + + return total_intrs; +} + +static void dump_driver_stats(long dt) +{ + int i; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double intrs_ps; + int n_ints = get_irqs(); + + if (n_ints < 0) { + printf("error getting intr info for intr %i\n", irq_no); + return; + } + xsks[i]->drv_stats.intrs = n_ints - irqs_at_init; + + intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) * + 1000000000. / dt; + + printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count"); + printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs); + + xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs; + } +} + static void dump_stats(void) { unsigned long now = get_nsecs(); @@ -194,67 +356,83 @@ static void dump_stats(void) prev_time = now; for (i = 0; i < num_socks && xsks[i]; i++) { - char *fmt = "%-15s %'-11.0f %'-11lu\n"; + char *fmt = "%-18s %'-14.0f %'-14lu\n"; double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, tx_invalid_pps, tx_empty_pps; - rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * + rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) * 1000000000. 
/ dt; - tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * + tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) * 1000000000. / dt; printf("\n sock%d@", i); print_benchmark(false); printf("\n"); - printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", + printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts", dt / 1000000000.); - printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); - printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); + printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); + printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); - xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; - xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; + xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts; + xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts; if (opt_extra_stats) { if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) { - dropped_pps = (xsks[i]->rx_dropped_npkts - - xsks[i]->prev_rx_dropped_npkts) * 1000000000. / dt; - rx_invalid_pps = (xsks[i]->rx_invalid_npkts - - xsks[i]->prev_rx_invalid_npkts) * 1000000000. / dt; - tx_invalid_pps = (xsks[i]->tx_invalid_npkts - - xsks[i]->prev_tx_invalid_npkts) * 1000000000. / dt; - full_pps = (xsks[i]->rx_full_npkts - - xsks[i]->prev_rx_full_npkts) * 1000000000. / dt; - fill_empty_pps = (xsks[i]->rx_fill_empty_npkts - - xsks[i]->prev_rx_fill_empty_npkts) - * 1000000000. / dt; - tx_empty_pps = (xsks[i]->tx_empty_npkts - - xsks[i]->prev_tx_empty_npkts) * 1000000000. / dt; + dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts - + xsks[i]->ring_stats.prev_rx_dropped_npkts) * + 1000000000. / dt; + rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts - + xsks[i]->ring_stats.prev_rx_invalid_npkts) * + 1000000000. / dt; + tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts - + xsks[i]->ring_stats.prev_tx_invalid_npkts) * + 1000000000. / dt; + full_pps = (xsks[i]->ring_stats.rx_full_npkts - + xsks[i]->ring_stats.prev_rx_full_npkts) * + 1000000000. / dt; + fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts - + xsks[i]->ring_stats.prev_rx_fill_empty_npkts) * + 1000000000. / dt; + tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts - + xsks[i]->ring_stats.prev_tx_empty_npkts) * + 1000000000. 
/ dt; printf(fmt, "rx dropped", dropped_pps, - xsks[i]->rx_dropped_npkts); + xsks[i]->ring_stats.rx_dropped_npkts); printf(fmt, "rx invalid", rx_invalid_pps, - xsks[i]->rx_invalid_npkts); + xsks[i]->ring_stats.rx_invalid_npkts); printf(fmt, "tx invalid", tx_invalid_pps, - xsks[i]->tx_invalid_npkts); + xsks[i]->ring_stats.tx_invalid_npkts); printf(fmt, "rx queue full", full_pps, - xsks[i]->rx_full_npkts); + xsks[i]->ring_stats.rx_full_npkts); printf(fmt, "fill ring empty", fill_empty_pps, - xsks[i]->rx_fill_empty_npkts); + xsks[i]->ring_stats.rx_fill_empty_npkts); printf(fmt, "tx ring empty", tx_empty_pps, - xsks[i]->tx_empty_npkts); - - xsks[i]->prev_rx_dropped_npkts = xsks[i]->rx_dropped_npkts; - xsks[i]->prev_rx_invalid_npkts = xsks[i]->rx_invalid_npkts; - xsks[i]->prev_tx_invalid_npkts = xsks[i]->tx_invalid_npkts; - xsks[i]->prev_rx_full_npkts = xsks[i]->rx_full_npkts; - xsks[i]->prev_rx_fill_empty_npkts = xsks[i]->rx_fill_empty_npkts; - xsks[i]->prev_tx_empty_npkts = xsks[i]->tx_empty_npkts; + xsks[i]->ring_stats.tx_empty_npkts); + + xsks[i]->ring_stats.prev_rx_dropped_npkts = + xsks[i]->ring_stats.rx_dropped_npkts; + xsks[i]->ring_stats.prev_rx_invalid_npkts = + xsks[i]->ring_stats.rx_invalid_npkts; + xsks[i]->ring_stats.prev_tx_invalid_npkts = + xsks[i]->ring_stats.tx_invalid_npkts; + xsks[i]->ring_stats.prev_rx_full_npkts = + xsks[i]->ring_stats.rx_full_npkts; + xsks[i]->ring_stats.prev_rx_fill_empty_npkts = + xsks[i]->ring_stats.rx_fill_empty_npkts; + xsks[i]->ring_stats.prev_tx_empty_npkts = + xsks[i]->ring_stats.tx_empty_npkts; } else { printf("%-15s\n", "Error retrieving extra stats"); } } } + + if (opt_app_stats) + dump_app_stats(dt); + if (irq_no) + dump_driver_stats(dt); } static bool is_benchmark_done(void) @@ -693,6 +871,17 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, if (ret) exit_with_error(-ret); + xsk->app_stats.rx_empty_polls = 0; + xsk->app_stats.fill_fail_polls = 0; + xsk->app_stats.copy_tx_sendtos = 0; + xsk->app_stats.tx_wakeup_sendtos = 0; + xsk->app_stats.opt_polls = 0; + xsk->app_stats.prev_rx_empty_polls = 0; + xsk->app_stats.prev_fill_fail_polls = 0; + xsk->app_stats.prev_copy_tx_sendtos = 0; + xsk->app_stats.prev_tx_wakeup_sendtos = 0; + xsk->app_stats.prev_opt_polls = 0; + return xsk; } @@ -720,6 +909,8 @@ static struct option long_options[] = { {"tx-pkt-pattern", required_argument, 0, 'P'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, + {"app-stats", no_argument, 0, 'a'}, + {"irq-string", no_argument, 0, 'I'}, {0, 0, 0, 0} }; @@ -756,6 +947,8 @@ static void usage(const char *prog) " -P, --tx-pkt-pattern=nPacket fill pattern. 
Default: 0x%x\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" + " -a, --app-stats Display application (syscall) statistics.\n" + " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -771,7 +964,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:", long_options, &option_index); if (c == -1) break; @@ -858,6 +1051,19 @@ static void parse_command_line(int argc, char **argv) case 'Q': opt_quiet = 1; break; + case 'a': + opt_app_stats = 1; + break; + case 'I': + opt_irq_str = optarg; + if (get_interrupt_number()) + irqs_at_init = get_irqs(); + if (irqs_at_init < 0) { + fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); + usage(basename(argv[0])); + } + + break; default: usage(basename(argv[0])); } @@ -908,8 +1114,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, * is driven by the NAPI loop. So as an optimization, we do not have to call * sendto() all the time in zero-copy mode for l2fwd. */ - if (opt_xdp_bind_flags & XDP_COPY) + if (opt_xdp_bind_flags & XDP_COPY) { + xsk->app_stats.copy_tx_sendtos++; kick_tx(xsk); + } ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : xsk->outstanding_tx; @@ -924,8 +1132,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&umem->fq)) + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + xsk->app_stats.fill_fail_polls++; ret = poll(fds, num_socks, opt_timeout); + } ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); } @@ -936,7 +1146,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; + xsk->ring_stats.tx_npkts += rcvd; } } @@ -949,14 +1159,16 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, if (!xsk->outstanding_tx) return; - if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { + xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); + } rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; + xsk->ring_stats.tx_npkts += rcvd; } } @@ -968,8 +1180,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.rx_empty_polls++; ret = poll(fds, num_socks, opt_timeout); + } return; } @@ -977,8 +1191,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.fill_fail_polls++; ret = poll(fds, num_socks, opt_timeout); + } ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } @@ -996,7 +1212,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) 
xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; + xsk->ring_stats.rx_npkts += rcvd; } static void rx_drop_all(void) @@ -1011,6 +1227,8 @@ static void rx_drop_all(void) for (;;) { if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; @@ -1091,6 +1309,8 @@ static void tx_only_all(void) int batch_size = get_batch_size(pkt_cnt); if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; @@ -1122,8 +1342,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.rx_empty_polls++; ret = poll(fds, num_socks, opt_timeout); + } return; } @@ -1132,8 +1354,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) if (ret < 0) exit_with_error(-ret); complete_tx_l2fwd(xsk, fds); - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); + } ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); } @@ -1155,7 +1379,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_prod__submit(&xsk->tx, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; + xsk->ring_stats.rx_npkts += rcvd; xsk->outstanding_tx += rcvd; } @@ -1171,6 +1395,8 @@ static void l2fwd_all(void) for (;;) { if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4f556cfcbfbe..bf5a99d803e4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: - * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. 
+ */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -417,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. */ @@ -1680,7 +1701,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return @@ -2235,7 +2256,7 @@ union bpf_attr { * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it + * if the verdict eBPF program returns **SK_PASS**), redirect it * to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. The @@ -3661,10 +3682,59 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. 
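The two per-CPU helpers documented above pair with the new BPF_PSEUDO_BTF_ID ldimm64 form: a __ksym extern resolves to the address of a kernel variable, and bpf_per_cpu_ptr()/bpf_this_cpu_ptr() turn a per-CPU ksym address into a pointer usable on a given CPU. A hedged sketch of the intended usage, assuming a vmlinux.h generated from the running kernel's BTF and using 'runqueues' purely as a familiar per-CPU kernel symbol:

/* Sketch only: depends on the kernel exposing 'runqueues' and struct rq
 * in its BTF; not taken verbatim from the selftests.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern const struct rq runqueues __ksym;	/* typed per-cpu ksym */

SEC("raw_tp/sys_enter")
int log_nr_running(void *ctx)
{
	__u32 cpu = bpf_get_smp_processor_id();
	struct rq *rq;

	rq = bpf_per_cpu_ptr(&runqueues, cpu);
	if (!rq)	/* NULL when cpu >= nr_cpu_ids */
		return 0;

	bpf_printk("cpu %u nr_running %u", cpu, rq->nr_running);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";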
+ * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. @@ -3823,6 +3893,9 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3df1f4db9f3a..313034117070 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -390,6 +390,12 @@ struct extern_desc { } kcfg; struct { unsigned long long addr; + + /* target btf_id of the corresponding kernel var. */ + int vmlinux_btf_id; + + /* local btf_id of the ksym extern's type. */ + __u32 type_id; } ksym; }; }; @@ -2522,12 +2528,23 @@ static int bpf_object__load_vmlinux_btf(struct bpf_object *obj) { bool need_vmlinux_btf = false; struct bpf_program *prog; - int err; + int i, err; /* CO-RE relocations need kernel BTF */ if (obj->btf_ext && obj->btf_ext->core_relo_info.len) need_vmlinux_btf = true; + /* Support for typed ksyms needs kernel BTF */ + for (i = 0; i < obj->nr_extern; i++) { + const struct extern_desc *ext; + + ext = &obj->externs[i]; + if (ext->type == EXT_KSYM && ext->ksym.type_id) { + need_vmlinux_btf = true; + break; + } + } + bpf_object__for_each_program(prog, obj) { if (!prog->load) continue; @@ -3156,16 +3173,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return -ENOTSUP; } } else if (strcmp(sec_name, KSYMS_SEC) == 0) { - const struct btf_type *vt; - ksym_sec = sec; ext->type = EXT_KSYM; - - vt = skip_mods_and_typedefs(obj->btf, t->type, NULL); - if (!btf_is_void(vt)) { - pr_warn("extern (ksym) '%s' is not typeless (void)\n", ext_name); - return -ENOTSUP; - } + skip_mods_and_typedefs(obj->btf, t->type, + &ext->ksym.type_id); } else { pr_warn("unrecognized extern section '%s'\n", sec_name); return -ENOTSUP; @@ -4192,6 +4203,36 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) return 0; } +static int init_map_slots(struct bpf_map *map) +{ + const struct bpf_map *targ_map; + unsigned int i; + int fd, err; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_map = map->init_slots[i]; + fd = bpf_map__fd(targ_map); + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", + map->name, i, targ_map->name, + fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", + map->name, i, targ_map->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + static int bpf_object__create_maps(struct bpf_object *obj) { @@ -4215,47 +4256,29 @@ bpf_object__create_maps(struct bpf_object *obj) if (map->fd >= 0) { pr_debug("map '%s': skipping 
creation (preset fd=%d)\n", map->name, map->fd); - continue; - } - - err = bpf_object__create_map(obj, map); - if (err) - goto err_out; - - pr_debug("map '%s': created successfully, fd=%d\n", map->name, - map->fd); - - if (bpf_map__is_internal(map)) { - err = bpf_object__populate_internal_map(obj, map); - if (err < 0) { - zclose(map->fd); + } else { + err = bpf_object__create_map(obj, map); + if (err) goto err_out; - } - } - if (map->init_slots_sz) { - for (j = 0; j < map->init_slots_sz; j++) { - const struct bpf_map *targ_map; - int fd; + pr_debug("map '%s': created successfully, fd=%d\n", + map->name, map->fd); - if (!map->init_slots[j]) - continue; + if (bpf_map__is_internal(map)) { + err = bpf_object__populate_internal_map(obj, map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } + } - targ_map = map->init_slots[j]; - fd = bpf_map__fd(targ_map); - err = bpf_map_update_elem(map->fd, &j, &fd, 0); - if (err) { - err = -errno; - pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", - map->name, j, targ_map->name, - fd, err); + if (map->init_slots_sz) { + err = init_map_slots(map); + if (err < 0) { + zclose(map->fd); goto err_out; } - pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", - map->name, j, targ_map->name, fd); } - zfree(&map->init_slots); - map->init_slots_sz = 0; } if (map->pin_path && !map->pinned) { @@ -5017,16 +5040,19 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, static int bpf_core_calc_field_relo(const struct bpf_program *prog, const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, - __u32 *val, bool *validate) + __u32 *val, __u32 *field_sz, __u32 *type_id, + bool *validate) { const struct bpf_core_accessor *acc; const struct btf_type *t; - __u32 byte_off, byte_sz, bit_off, bit_sz; + __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id; const struct btf_member *m; const struct btf_type *mt; bool bitfield; __s64 sz; + *field_sz = 0; + if (relo->kind == BPF_FIELD_EXISTS) { *val = spec ? 
1 : 0; return 0; @@ -5042,6 +5068,12 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, if (!acc->name) { if (relo->kind == BPF_FIELD_BYTE_OFFSET) { *val = spec->bit_offset / 8; + /* remember field size for load/store mem size */ + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *field_sz = sz; + *type_id = acc->type_id; } else if (relo->kind == BPF_FIELD_BYTE_SIZE) { sz = btf__resolve_size(spec->btf, acc->type_id); if (sz < 0) @@ -5058,7 +5090,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, } m = btf_members(t) + acc->idx; - mt = skip_mods_and_typedefs(spec->btf, m->type, NULL); + mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); bit_off = spec->bit_offset; bit_sz = btf_member_bitfield_size(t, acc->idx); @@ -5078,7 +5110,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, byte_off = bit_off / 8 / byte_sz * byte_sz; } } else { - sz = btf__resolve_size(spec->btf, m->type); + sz = btf__resolve_size(spec->btf, field_type_id); if (sz < 0) return -EINVAL; byte_sz = sz; @@ -5096,6 +5128,10 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, switch (relo->kind) { case BPF_FIELD_BYTE_OFFSET: *val = byte_off; + if (!bitfield) { + *field_sz = byte_sz; + *type_id = field_type_id; + } break; case BPF_FIELD_BYTE_SIZE: *val = byte_sz; @@ -5196,6 +5232,19 @@ struct bpf_core_relo_res bool poison; /* some relocations can't be validated against orig_val */ bool validate; + /* for field byte offset relocations or the forms: + * *(T *)(rX + <off>) = rY + * rX = *(T *)(rY + <off>), + * we remember original and resolved field size to adjust direct + * memory loads of pointers and integers; this is necessary for 32-bit + * host kernel architectures, but also allows to automatically + * relocate fields that were resized from, e.g., u32 to u64, etc. + */ + bool fail_memsz_adjust; + __u32 orig_sz; + __u32 orig_type_id; + __u32 new_sz; + __u32 new_type_id; }; /* Calculate original and target relocation values, given local and target @@ -5217,10 +5266,56 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, res->new_val = 0; res->poison = false; res->validate = true; + res->fail_memsz_adjust = false; + res->orig_sz = res->new_sz = 0; + res->orig_type_id = res->new_type_id = 0; if (core_relo_is_field_based(relo->kind)) { - err = bpf_core_calc_field_relo(prog, relo, local_spec, &res->orig_val, &res->validate); - err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, &res->new_val, NULL); + err = bpf_core_calc_field_relo(prog, relo, local_spec, + &res->orig_val, &res->orig_sz, + &res->orig_type_id, &res->validate); + err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, + &res->new_val, &res->new_sz, + &res->new_type_id, NULL); + if (err) + goto done; + /* Validate if it's safe to adjust load/store memory size. + * Adjustments are performed only if original and new memory + * sizes differ. 
+ */ + res->fail_memsz_adjust = false; + if (res->orig_sz != res->new_sz) { + const struct btf_type *orig_t, *new_t; + + orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id); + new_t = btf__type_by_id(targ_spec->btf, res->new_type_id); + + /* There are two use cases in which it's safe to + * adjust load/store's mem size: + * - reading a 32-bit kernel pointer, while on BPF + * size pointers are always 64-bit; in this case + * it's safe to "downsize" instruction size due to + * pointer being treated as unsigned integer with + * zero-extended upper 32-bits; + * - reading unsigned integers, again due to + * zero-extension is preserving the value correctly. + * + * In all other cases it's incorrect to attempt to + * load/store field because read value will be + * incorrect, so we poison relocated instruction. + */ + if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) + goto done; + if (btf_is_int(orig_t) && btf_is_int(new_t) && + btf_int_encoding(orig_t) != BTF_INT_SIGNED && + btf_int_encoding(new_t) != BTF_INT_SIGNED) + goto done; + + /* mark as invalid mem size adjustment, but this will + * only be checked for LDX/STX/ST insns + */ + res->fail_memsz_adjust = true; + } } else if (core_relo_is_type_based(relo->kind)) { err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val); err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val); @@ -5229,6 +5324,7 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); } +done: if (err == -EUCLEAN) { /* EUCLEAN is used to signal instruction poisoning request */ res->poison = true; @@ -5268,6 +5364,28 @@ static bool is_ldimm64(struct bpf_insn *insn) return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } +static int insn_bpf_size_to_bytes(struct bpf_insn *insn) +{ + switch (BPF_SIZE(insn->code)) { + case BPF_DW: return 8; + case BPF_W: return 4; + case BPF_H: return 2; + case BPF_B: return 1; + default: return -1; + } +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + /* * Patch relocatable BPF instruction. * @@ -5277,10 +5395,13 @@ static bool is_ldimm64(struct bpf_insn *insn) * spec, and is checked before patching instruction. If actual insn->imm value * is wrong, bail out with error. * - * Currently three kinds of BPF instructions are supported: + * Currently supported classes of BPF instruction are: * 1. rX = <imm> (assignment with immediate operand); * 2. rX += <imm> (arithmetic operations with immediate operand); - * 3. rX = <imm64> (load with 64-bit immediate value). + * 3. rX = <imm64> (load with 64-bit immediate value); + * 4. rX = *(T *)(rY + <off>), where T is one of {u8, u16, u32, u64}; + * 5. *(T *)(rX + <off>) = rY, where T is one of {u8, u16, u32, u64}; + * 6. *(T *)(rX + <off>) = <imm>, where T is one of {u8, u16, u32, u64}. */ static int bpf_core_patch_insn(struct bpf_program *prog, const struct bpf_core_relo *relo, @@ -5304,6 +5425,7 @@ static int bpf_core_patch_insn(struct bpf_program *prog, class = BPF_CLASS(insn->code); if (res->poison) { +poison: /* poison second part of ldimm64 to avoid confusing error from * verifier about "unknown opcode 00" */ @@ -5346,10 +5468,39 @@ static int bpf_core_patch_insn(struct bpf_program *prog, prog->name, relo_idx, insn_idx, new_val); return -ERANGE; } + if (res->fail_memsz_adjust) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. 
" + "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", + prog->name, relo_idx, insn_idx); + goto poison; + } + orig_val = insn->off; insn->off = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", prog->name, relo_idx, insn_idx, orig_val, new_val); + + if (res->new_sz != res->orig_sz) { + int insn_bytes_sz, insn_bpf_sz; + + insn_bytes_sz = insn_bpf_size_to_bytes(insn); + if (insn_bytes_sz != res->orig_sz) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", + prog->name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); + return -EINVAL; + } + + insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); + if (insn_bpf_sz < 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", + prog->name, relo_idx, insn_idx, res->new_sz); + return -EINVAL; + } + + insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", + prog->name, relo_idx, insn_idx, res->orig_sz, res->new_sz); + } break; case BPF_LD: { __u64 imm; @@ -5691,7 +5842,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) return 0; if (targ_btf_path) - targ_btf = btf__parse_elf(targ_btf_path, NULL); + targ_btf = btf__parse(targ_btf_path, NULL); else targ_btf = obj->btf_vmlinux; if (IS_ERR_OR_NULL(targ_btf)) { @@ -5742,6 +5893,11 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) err = -EINVAL; goto out; } + /* no need to apply CO-RE relocation if the program is + * not going to be loaded + */ + if (!prog->load) + continue; err = bpf_core_apply_relo(prog, rec, i, obj->btf, targ_btf, cand_cache); @@ -5800,8 +5956,13 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[0].imm = obj->maps[obj->kconfig_map_idx].fd; insn[1].imm = ext->kcfg.data_off; } else /* EXT_KSYM */ { - insn[0].imm = (__u32)ext->ksym.addr; - insn[1].imm = ext->ksym.addr >> 32; + if (ext->ksym.type_id) { /* typed ksyms */ + insn[0].src_reg = BPF_PSEUDO_BTF_ID; + insn[0].imm = ext->ksym.vmlinux_btf_id; + } else { /* typeless ksyms */ + insn[0].imm = (__u32)ext->ksym.addr; + insn[1].imm = ext->ksym.addr >> 32; + } } relo->processed = true; break; @@ -6933,10 +7094,72 @@ out: return err; } +static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +{ + struct extern_desc *ext; + int i, id; + + for (i = 0; i < obj->nr_extern; i++) { + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + const char *targ_var_name; + int ret; + + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM || !ext->ksym.type_id) + continue; + + id = btf__find_by_name_kind(obj->btf_vmlinux, ext->name, + BTF_KIND_VAR); + if (id <= 0) { + pr_warn("extern (ksym) '%s': failed to find BTF ID in vmlinux BTF.\n", + ext->name); + return -ESRCH; + } + + /* find local type_id */ + local_type_id = ext->ksym.type_id; + + /* find target type_id */ + targ_var = btf__type_by_id(obj->btf_vmlinux, id); + targ_var_name = btf__name_by_offset(obj->btf_vmlinux, + targ_var->name_off); + targ_type = skip_mods_and_typedefs(obj->btf_vmlinux, + targ_var->type, + &targ_type_id); + + ret = bpf_core_types_are_compat(obj->btf, local_type_id, + obj->btf_vmlinux, targ_type_id); + if (ret <= 0) { + const struct btf_type *local_type; + const char *targ_name, *local_name; + + local_type = btf__type_by_id(obj->btf, local_type_id); + local_name = 
btf__name_by_offset(obj->btf, + local_type->name_off); + targ_name = btf__name_by_offset(obj->btf_vmlinux, + targ_type->name_off); + + pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", + ext->name, local_type_id, + btf_kind_str(local_type), local_name, targ_type_id, + btf_kind_str(targ_type), targ_name); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.vmlinux_btf_id = id; + pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n", + ext->name, id, btf_kind_str(targ_var), targ_var_name); + } + return 0; +} + static int bpf_object__resolve_externs(struct bpf_object *obj, const char *extra_kconfig) { bool need_config = false, need_kallsyms = false; + bool need_vmlinux_btf = false; struct extern_desc *ext; void *kcfg_data = NULL; int err, i; @@ -6967,7 +7190,10 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, strncmp(ext->name, "CONFIG_", 7) == 0) { need_config = true; } else if (ext->type == EXT_KSYM) { - need_kallsyms = true; + if (ext->ksym.type_id) + need_vmlinux_btf = true; + else + need_kallsyms = true; } else { pr_warn("unrecognized extern '%s'\n", ext->name); return -EINVAL; @@ -6996,6 +7222,11 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, if (err) return -EINVAL; } + if (need_vmlinux_btf) { + err = bpf_object__resolve_ksyms_btf_id(obj); + if (err) + return -EINVAL; + } for (i = 0; i < obj->nr_extern; i++) { ext = &obj->externs[i]; @@ -7028,10 +7259,10 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) } err = bpf_object__probe_loading(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); - err = err ? : bpf_object__load_vmlinux_btf(obj); err = err ? : bpf_object__init_kern_struct_ops_maps(obj); err = err ? : bpf_object__create_maps(obj); err = err ? : bpf_object__relocate(obj, attr->target_btf_path); @@ -10353,9 +10584,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog, btf_id = libbpf_find_prog_btf_id(attach_func_name, attach_prog_fd); else - btf_id = __find_vmlinux_btf_id(prog->obj->btf_vmlinux, - attach_func_name, - prog->expected_attach_type); + btf_id = libbpf_find_vmlinux_btf_id(attach_func_name, + prog->expected_attach_type); if (btf_id < 0) return btf_id; diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 30b4ca5d2ac7..e3c98c007825 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -705,7 +705,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, struct xsk_ctx *ctx; int err, ifindex; - if (!umem || !xsk_ptr || !(rx || tx) || !fill || !comp) + if (!umem || !xsk_ptr || !(rx || tx)) return -EFAULT; xsk = calloc(1, sizeof(*xsk)); @@ -735,6 +735,11 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, ctx = xsk_get_ctx(umem, ifindex, queue_id); if (!ctx) { + if (!fill || !comp) { + err = -EFAULT; + goto out_socket; + } + ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id, fill, comp); if (!ctx) { diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 66acfcf15ff2..ac9eda830187 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -7,6 +7,44 @@ General instructions on running selftests can be found in Additional information about selftest failures are documented here. 
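For illustration, a minimal BPF-side sketch of what the typed-ksym resolution above (bpf_object__resolve_ksyms_btf_id() plus the new bpf_per_cpu_ptr()/bpf_this_cpu_ptr() helpers) enables. This is a hedged sketch modeled on the test_ksyms_btf selftest added elsewhere in this series, not part of the diff itself; 'handler' and 'out__rq_cpu' are placeholder names, and it assumes the kernel's vmlinux BTF exposes the per-CPU 'runqueues' variable:

    // SPDX-License-Identifier: GPL-2.0
    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    /* Typed ksym: libbpf resolves this extern against vmlinux BTF and
     * rewrites the ld_imm64 into a BPF_PSEUDO_BTF_ID load, as done in
     * bpf_object__relocate_data() above.
     */
    extern const struct rq runqueues __ksym;

    int out__rq_cpu = -1;	/* written by the program, read by user space */

    SEC("raw_tp/sys_enter")
    int handler(const void *ctx)
    {
    	struct rq *rq;
    	__u32 cpu = bpf_get_smp_processor_id();

    	/* bpf_per_cpu_ptr() returns the per-CPU instance of 'runqueues'
    	 * for the given CPU, or NULL if the CPU number is invalid.
    	 */
    	rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu);
    	if (rq)
    		out__rq_cpu = rq->cpu;
    	return 0;
    }

    char _license[] SEC("license") = "GPL";

In user space, the skeleton is simply opened, loaded, and attached as usual; no kallsyms parsing is needed for such typed externs, since the address and type come from vmlinux BTF at load time.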
+profiler[23] test failures with clang/llvm <12.0.0 +================================================== + +With clang/llvm <12.0.0, the profiler[23] test may fail. +The symptom looks like + +.. code-block:: c + + // r9 is a pointer to map_value + // r7 is a scalar + 17: bf 96 00 00 00 00 00 00 r6 = r9 + 18: 0f 76 00 00 00 00 00 00 r6 += r7 + math between map_value pointer and register with unbounded min value is not allowed + + // the instructions below will not be seen in the verifier log + 19: a5 07 01 00 01 01 00 00 if r7 < 257 goto +1 + 20: bf 96 00 00 00 00 00 00 r6 = r9 + // r6 is used here + +The verifier will reject such code with above error. +At insn 18 the r7 is indeed unbounded. The later insn 19 checks the bounds and +the insn 20 undoes map_value addition. It is currently impossible for the +verifier to understand such speculative pointer arithmetic. +Hence + https://reviews.llvm.org/D85570 +addresses it on the compiler side. It was committed on llvm 12. + +The corresponding C code +.. code-block:: c + + for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { + filepart_length = bpf_probe_read_str(payload, ...); + if (filepart_length <= MAX_PATH) { + barrier_var(filepart_length); // workaround + payload += filepart_length; + } + } + bpf_iter test failures with clang/llvm 10.0.0 ============================================= diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index c548aded6585..52414058a627 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -195,13 +195,13 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {8, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {10, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, - {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {12, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {14, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, }, @@ -518,7 +518,7 @@ static struct bpf_align_test tests[] = { * the total offset is 4-byte aligned and meets the * load's requirements. 
*/ - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, }, }, @@ -561,18 +561,18 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, + {13, "R5_w=pkt(id=2,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, + {16, "R5_w=pkt(id=3,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, + {20, "R5=pkt(id=3,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, }, }, }; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c index 540fea4c91a5..76ebe4c250f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -55,10 +55,10 @@ static int kern_sync_rcu(void) static void test_lookup_update(void) { - int err, key = 0, val, i; + int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id; + int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd; struct test_btf_map_in_map *skel; - int outer_arr_fd, outer_hash_fd; - int fd, map1_fd, map2_fd, map1_id, map2_id; + int err, key = 0, val, i, fd; skel = test_btf_map_in_map__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) @@ -70,32 +70,45 @@ static void test_lookup_update(void) map1_fd = bpf_map__fd(skel->maps.inner_map1); map2_fd = bpf_map__fd(skel->maps.inner_map2); + map3_fd = bpf_map__fd(skel->maps.inner_map3); + map4_fd = bpf_map__fd(skel->maps.inner_map4); + map5_fd = bpf_map__fd(skel->maps.inner_map5); + outer_arr_dyn_fd = bpf_map__fd(skel->maps.outer_arr_dyn); outer_arr_fd = bpf_map__fd(skel->maps.outer_arr); outer_hash_fd = bpf_map__fd(skel->maps.outer_hash); - /* inner1 = input, inner2 = input + 1 */ - map1_fd = bpf_map__fd(skel->maps.inner_map1); + /* inner1 = input, inner2 = input + 1, inner3 = input + 2 */ bpf_map_update_elem(outer_arr_fd, &key, &map1_fd, 0); - map2_fd = bpf_map__fd(skel->maps.inner_map2); bpf_map_update_elem(outer_hash_fd, &key, &map2_fd, 0); + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map3_fd, 0); skel->bss->input = 1; usleep(1); - bpf_map_lookup_elem(map1_fd, &key, &val); CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1); bpf_map_lookup_elem(map2_fd, &key, &val); CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2); + bpf_map_lookup_elem(map3_fd, &key, &val); + CHECK(val != 3, "inner3", "got %d != exp %d\n", val, 3); - /* inner1 = input + 1, inner2 = input */ + /* inner2 = input, inner1 = input + 1, inner4 = input + 2 */ bpf_map_update_elem(outer_arr_fd, &key, 
&map2_fd, 0); bpf_map_update_elem(outer_hash_fd, &key, &map1_fd, 0); + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map4_fd, 0); skel->bss->input = 3; usleep(1); - bpf_map_lookup_elem(map1_fd, &key, &val); CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4); bpf_map_lookup_elem(map2_fd, &key, &val); CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3); + bpf_map_lookup_elem(map4_fd, &key, &val); + CHECK(val != 5, "inner4", "got %d != exp %d\n", val, 5); + + /* inner5 = input + 2 */ + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map5_fd, 0); + skel->bss->input = 5; + usleep(1); + bpf_map_lookup_elem(map5_fd, &key, &val); + CHECK(val != 7, "inner5", "got %d != exp %d\n", val, 7); for (i = 0; i < 5; i++) { val = i % 2 ? map1_fd : map2_fd; @@ -106,7 +119,13 @@ static void test_lookup_update(void) } err = bpf_map_update_elem(outer_arr_fd, &key, &val, 0); if (CHECK_FAIL(err)) { - printf("failed to update hash_of_maps on iter #%d\n", i); + printf("failed to update array_of_maps on iter #%d\n", i); + goto cleanup; + } + val = i % 2 ? map4_fd : map5_fd; + err = bpf_map_update_elem(outer_arr_dyn_fd, &key, &val, 0); + if (CHECK_FAIL(err)) { + printf("failed to update array_of_maps (dyn) on iter #%d\n", i); goto cleanup; } } diff --git a/tools/testing/selftests/bpf/prog_tests/core_autosize.c b/tools/testing/selftests/bpf/prog_tests/core_autosize.c new file mode 100644 index 000000000000..981c251453d9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_autosize.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <test_progs.h> +#include <bpf/btf.h> + +/* real layout and sizes according to test's (32-bit) BTF + * needs to be defined before skeleton is included */ +struct test_struct___real { + unsigned int ptr; /* can't use `void *`, it is always 8 byte in BPF target */ + unsigned int val2; + unsigned long long val1; + unsigned short val3; + unsigned char val4; + unsigned char _pad; +}; + +#include "test_core_autosize.skel.h" + +static int duration = 0; + +static struct { + unsigned long long ptr_samesized; + unsigned long long val1_samesized; + unsigned long long val2_samesized; + unsigned long long val3_samesized; + unsigned long long val4_samesized; + struct test_struct___real output_samesized; + + unsigned long long ptr_downsized; + unsigned long long val1_downsized; + unsigned long long val2_downsized; + unsigned long long val3_downsized; + unsigned long long val4_downsized; + struct test_struct___real output_downsized; + + unsigned long long ptr_probed; + unsigned long long val1_probed; + unsigned long long val2_probed; + unsigned long long val3_probed; + unsigned long long val4_probed; + + unsigned long long ptr_signed; + unsigned long long val1_signed; + unsigned long long val2_signed; + unsigned long long val3_signed; + unsigned long long val4_signed; + struct test_struct___real output_signed; +} out; + +void test_core_autosize(void) +{ + char btf_file[] = "/tmp/core_autosize.btf.XXXXXX"; + int err, fd = -1, zero = 0; + int char_id, short_id, int_id, long_long_id, void_ptr_id, id; + struct test_core_autosize* skel = NULL; + struct bpf_object_load_attr load_attr = {}; + struct bpf_program *prog; + struct bpf_map *bss_map; + struct btf *btf = NULL; + size_t written; + const void *raw_data; + __u32 raw_sz; + FILE *f = NULL; + + btf = btf__new_empty(); + if (!ASSERT_OK_PTR(btf, "empty_btf")) + return; + /* Emit the following struct with 32-bit pointer size: + * + * struct test_struct { + * void *ptr; + * unsigned long val2; + * 
unsigned long long val1; + * unsigned short val3; + * unsigned char val4; + * char: 8; + * }; + * + * This struct is going to be used as the "kernel BTF" for this test. + * It's equivalent memory-layout-wise to test_struct__real above. + */ + + /* force 32-bit pointer size */ + btf__set_pointer_size(btf, 4); + + char_id = btf__add_int(btf, "unsigned char", 1, 0); + ASSERT_EQ(char_id, 1, "char_id"); + short_id = btf__add_int(btf, "unsigned short", 2, 0); + ASSERT_EQ(short_id, 2, "short_id"); + /* "long unsigned int" of 4 byte size tells BTF that sizeof(void *) == 4 */ + int_id = btf__add_int(btf, "long unsigned int", 4, 0); + ASSERT_EQ(int_id, 3, "int_id"); + long_long_id = btf__add_int(btf, "unsigned long long", 8, 0); + ASSERT_EQ(long_long_id, 4, "long_long_id"); + void_ptr_id = btf__add_ptr(btf, 0); + ASSERT_EQ(void_ptr_id, 5, "void_ptr_id"); + + id = btf__add_struct(btf, "test_struct", 20 /* bytes */); + ASSERT_EQ(id, 6, "struct_id"); + err = btf__add_field(btf, "ptr", void_ptr_id, 0, 0); + err = err ?: btf__add_field(btf, "val2", int_id, 32, 0); + err = err ?: btf__add_field(btf, "val1", long_long_id, 64, 0); + err = err ?: btf__add_field(btf, "val3", short_id, 128, 0); + err = err ?: btf__add_field(btf, "val4", char_id, 144, 0); + ASSERT_OK(err, "struct_fields"); + + fd = mkstemp(btf_file); + if (CHECK(fd < 0, "btf_tmp", "failed to create file: %d\n", fd)) + goto cleanup; + f = fdopen(fd, "w"); + if (!ASSERT_OK_PTR(f, "btf_fdopen")) + goto cleanup; + + raw_data = btf__get_raw_data(btf, &raw_sz); + if (!ASSERT_OK_PTR(raw_data, "raw_data")) + goto cleanup; + written = fwrite(raw_data, 1, raw_sz, f); + if (CHECK(written != raw_sz, "btf_write", "written: %zu, errno: %d\n", written, errno)) + goto cleanup; + fflush(f); + fclose(f); + f = NULL; + close(fd); + fd = -1; + + /* open and load BPF program with custom BTF as the kernel BTF */ + skel = test_core_autosize__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + /* disable handle_signed() for now */ + prog = bpf_object__find_program_by_name(skel->obj, "handle_signed"); + if (!ASSERT_OK_PTR(prog, "prog_find")) + goto cleanup; + bpf_program__set_autoload(prog, false); + + load_attr.obj = skel->obj; + load_attr.target_btf_path = btf_file; + err = bpf_object__load_xattr(&load_attr); + if (!ASSERT_OK(err, "prog_load")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, "handle_samesize"); + if (!ASSERT_OK_PTR(prog, "prog_find")) + goto cleanup; + skel->links.handle_samesize = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(skel->links.handle_samesize, "prog_attach")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, "handle_downsize"); + if (!ASSERT_OK_PTR(prog, "prog_find")) + goto cleanup; + skel->links.handle_downsize = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(skel->links.handle_downsize, "prog_attach")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, "handle_probed"); + if (!ASSERT_OK_PTR(prog, "prog_find")) + goto cleanup; + skel->links.handle_probed = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(skel->links.handle_probed, "prog_attach")) + goto cleanup; + + usleep(1); + + bss_map = bpf_object__find_map_by_name(skel->obj, "test_cor.bss"); + if (!ASSERT_OK_PTR(bss_map, "bss_map_find")) + goto cleanup; + + err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &zero, (void *)&out); + if (!ASSERT_OK(err, "bss_lookup")) + goto cleanup; + + ASSERT_EQ(out.ptr_samesized, 0x01020304, "ptr_samesized"); + ASSERT_EQ(out.val1_samesized, 0x1020304050607080, 
"val1_samesized"); + ASSERT_EQ(out.val2_samesized, 0x0a0b0c0d, "val2_samesized"); + ASSERT_EQ(out.val3_samesized, 0xfeed, "val3_samesized"); + ASSERT_EQ(out.val4_samesized, 0xb9, "val4_samesized"); + ASSERT_EQ(out.output_samesized.ptr, 0x01020304, "ptr_samesized"); + ASSERT_EQ(out.output_samesized.val1, 0x1020304050607080, "val1_samesized"); + ASSERT_EQ(out.output_samesized.val2, 0x0a0b0c0d, "val2_samesized"); + ASSERT_EQ(out.output_samesized.val3, 0xfeed, "val3_samesized"); + ASSERT_EQ(out.output_samesized.val4, 0xb9, "val4_samesized"); + + ASSERT_EQ(out.ptr_downsized, 0x01020304, "ptr_downsized"); + ASSERT_EQ(out.val1_downsized, 0x1020304050607080, "val1_downsized"); + ASSERT_EQ(out.val2_downsized, 0x0a0b0c0d, "val2_downsized"); + ASSERT_EQ(out.val3_downsized, 0xfeed, "val3_downsized"); + ASSERT_EQ(out.val4_downsized, 0xb9, "val4_downsized"); + ASSERT_EQ(out.output_downsized.ptr, 0x01020304, "ptr_downsized"); + ASSERT_EQ(out.output_downsized.val1, 0x1020304050607080, "val1_downsized"); + ASSERT_EQ(out.output_downsized.val2, 0x0a0b0c0d, "val2_downsized"); + ASSERT_EQ(out.output_downsized.val3, 0xfeed, "val3_downsized"); + ASSERT_EQ(out.output_downsized.val4, 0xb9, "val4_downsized"); + + ASSERT_EQ(out.ptr_probed, 0x01020304, "ptr_probed"); + ASSERT_EQ(out.val1_probed, 0x1020304050607080, "val1_probed"); + ASSERT_EQ(out.val2_probed, 0x0a0b0c0d, "val2_probed"); + ASSERT_EQ(out.val3_probed, 0xfeed, "val3_probed"); + ASSERT_EQ(out.val4_probed, 0xb9, "val4_probed"); + + test_core_autosize__destroy(skel); + skel = NULL; + + /* now re-load with handle_signed() enabled, it should fail loading */ + skel = test_core_autosize__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + load_attr.obj = skel->obj; + load_attr.target_btf_path = btf_file; + err = bpf_object__load_xattr(&load_attr); + if (!ASSERT_ERR(err, "bad_prog_load")) + goto cleanup; + +cleanup: + if (f) + fclose(f); + if (fd >= 0) + close(fd); + remove(btf_file); + btf__free(btf); + test_core_autosize__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index b771804b2342..b295969b263b 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -7,40 +7,28 @@ static int duration; -static __u64 kallsyms_find(const char *sym) -{ - char type, name[500]; - __u64 addr, res = 0; - FILE *f; - - f = fopen("/proc/kallsyms", "r"); - if (CHECK(!f, "kallsyms_fopen", "failed to open: %d\n", errno)) - return 0; - - while (fscanf(f, "%llx %c %499s%*[^\n]\n", &addr, &type, name) > 0) { - if (strcmp(name, sym) == 0) { - res = addr; - goto out; - } - } - - CHECK(false, "not_found", "symbol %s not found\n", sym); -out: - fclose(f); - return res; -} - void test_ksyms(void) { - __u64 per_cpu_start_addr = kallsyms_find("__per_cpu_start"); - __u64 link_fops_addr = kallsyms_find("bpf_link_fops"); const char *btf_path = "/sys/kernel/btf/vmlinux"; struct test_ksyms *skel; struct test_ksyms__data *data; + __u64 link_fops_addr, per_cpu_start_addr; struct stat st; __u64 btf_size; int err; + err = kallsyms_find("bpf_link_fops", &link_fops_addr); + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + return; + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) + return; + + err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + return; + if (CHECK(err == -ENOENT, "ksym_find", "symbol 
'per_cpu_start' not found\n")) + return; + if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) return; btf_size = st.st_size; diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c new file mode 100644 index 000000000000..28e26bd3e0ca --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Google */ + +#include <test_progs.h> +#include <bpf/libbpf.h> +#include <bpf/btf.h> +#include "test_ksyms_btf.skel.h" + +static int duration; + +void test_ksyms_btf(void) +{ + __u64 runqueues_addr, bpf_prog_active_addr; + __u32 this_rq_cpu; + int this_bpf_prog_active; + struct test_ksyms_btf *skel = NULL; + struct test_ksyms_btf__data *data; + struct btf *btf; + int percpu_datasec; + int err; + + err = kallsyms_find("runqueues", &runqueues_addr); + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + return; + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'runqueues' not found\n")) + return; + + err = kallsyms_find("bpf_prog_active", &bpf_prog_active_addr); + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + return; + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_prog_active' not found\n")) + return; + + btf = libbpf_find_kernel_btf(); + if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n", + PTR_ERR(btf))) + return; + + percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu", + BTF_KIND_DATASEC); + if (percpu_datasec < 0) { + printf("%s:SKIP:no PERCPU DATASEC in kernel btf\n", + __func__); + test__skip(); + goto cleanup; + } + + skel = test_ksyms_btf__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) + goto cleanup; + + err = test_ksyms_btf__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + data = skel->data; + CHECK(data->out__runqueues_addr != runqueues_addr, "runqueues_addr", + "got %llu, exp %llu\n", + (unsigned long long)data->out__runqueues_addr, + (unsigned long long)runqueues_addr); + CHECK(data->out__bpf_prog_active_addr != bpf_prog_active_addr, "bpf_prog_active_addr", + "got %llu, exp %llu\n", + (unsigned long long)data->out__bpf_prog_active_addr, + (unsigned long long)bpf_prog_active_addr); + + CHECK(data->out__rq_cpu == -1, "rq_cpu", + "got %u, exp != -1\n", data->out__rq_cpu); + CHECK(data->out__bpf_prog_active < 0, "bpf_prog_active", + "got %d, exp >= 0\n", data->out__bpf_prog_active); + CHECK(data->out__cpu_0_rq_cpu != 0, "cpu_rq(0)->cpu", + "got %u, exp 0\n", data->out__cpu_0_rq_cpu); + + this_rq_cpu = data->out__this_rq_cpu; + CHECK(this_rq_cpu != data->out__rq_cpu, "this_rq_cpu", + "got %u, exp %u\n", this_rq_cpu, data->out__rq_cpu); + + this_bpf_prog_active = data->out__this_bpf_prog_active; + CHECK(this_bpf_prog_active != data->out__bpf_prog_active, "this_bpf_prog_active", + "got %d, exp %d\n", this_bpf_prog_active, + data->out__bpf_prog_active); + +cleanup: + btf__free(btf); + test_ksyms_btf__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c index 041952524c55..fcf54b3a1dd0 100644 --- a/tools/testing/selftests/bpf/prog_tests/pinning.c +++ b/tools/testing/selftests/bpf/prog_tests/pinning.c @@ -37,7 +37,7 @@ void test_pinning(void) struct stat statbuf = {}; struct bpf_object *obj; struct bpf_map *map; - int err; + int err, 
map_fd; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .pin_root_path = custpath, ); @@ -213,6 +213,53 @@ void test_pinning(void) if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno)) goto out; + /* remove the custom pin path to re-test it with reuse fd below */ + err = unlink(custpinpath); + if (CHECK(err, "unlink custpinpath", "err %d errno %d\n", err, errno)) + goto out; + + err = rmdir(custpath); + if (CHECK(err, "rmdir custpindir", "err %d errno %d\n", err, errno)) + goto out; + + bpf_object__close(obj); + + /* test pinning at custom path with reuse fd */ + obj = bpf_object__open_file(file, NULL); + err = libbpf_get_error(obj); + if (CHECK(err, "default open", "err %d errno %d\n", err, errno)) { + obj = NULL; + goto out; + } + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32), + sizeof(__u64), 1, 0); + if (CHECK(map_fd < 0, "create pinmap manually", "fd %d\n", map_fd)) + goto out; + + map = bpf_object__find_map_by_name(obj, "pinmap"); + if (CHECK(!map, "find map", "NULL map")) + goto close_map_fd; + + err = bpf_map__reuse_fd(map, map_fd); + if (CHECK(err, "reuse pinmap fd", "err %d errno %d\n", err, errno)) + goto close_map_fd; + + err = bpf_map__set_pin_path(map, custpinpath); + if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno)) + goto close_map_fd; + + err = bpf_object__load(obj); + if (CHECK(err, "custom load", "err %d errno %d\n", err, errno)) + goto close_map_fd; + + /* check that pinmap was pinned at the custom path */ + err = stat(custpinpath, &statbuf); + if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno)) + goto close_map_fd; + +close_map_fd: + close(map_fd); out: unlink(pinpath); unlink(nopinpath); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 4c4224e3e10a..85f73261fab0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -198,7 +198,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); int err, len, src_fd, iter_fd, duration = 0; - union bpf_iter_link_info linfo = {0}; + union bpf_iter_link_info linfo = {}; __u32 i, num_sockets, num_elems; struct bpf_iter_sockmap *skel; __s64 *sock_fd = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index 24ba0d21b641..c86e67214a9e 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -264,9 +264,19 @@ static int check_error_linum(const struct sk_fds *sk_fds) static void check_hdr_and_close_fds(struct sk_fds *sk_fds) { + const __u32 expected_inherit_cb_flags = + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_STATE_CB_FLAG; + if (sk_fds_shutdown(sk_fds)) goto check_linum; + if (CHECK(expected_inherit_cb_flags != skel->bss->inherit_cb_flags, + "Unexpected inherit_cb_flags", "0x%x != 0x%x\n", + skel->bss->inherit_cb_flags, expected_inherit_cb_flags)) + goto check_linum; + if (check_hdr_stg(&exp_passive_hdr_stg, sk_fds->passive_fd, "passive_hdr_stg")) goto check_linum; @@ -321,6 +331,8 @@ static void reset_test(void) memset(&skel->bss->active_estab_in, 0, optsize); memset(&skel->bss->active_fin_in, 0, optsize); + skel->bss->inherit_cb_flags = 0; + skel->data->test_kind = TCPOPT_EXP; skel->data->test_magic = 0xeB9F; diff --git 
a/tools/testing/selftests/bpf/prog_tests/test_profiler.c b/tools/testing/selftests/bpf/prog_tests/test_profiler.c new file mode 100644 index 000000000000..4ca275101ee0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_profiler.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <test_progs.h> +#include "progs/profiler.h" +#include "profiler1.skel.h" +#include "profiler2.skel.h" +#include "profiler3.skel.h" + +static int sanity_run(struct bpf_program *prog) +{ + struct bpf_prog_test_run_attr test_attr = {}; + __u64 args[] = {1, 2, 3}; + __u32 duration = 0; + int err, prog_fd; + + prog_fd = bpf_program__fd(prog); + test_attr.prog_fd = prog_fd; + test_attr.ctx_in = args; + test_attr.ctx_size_in = sizeof(args); + err = bpf_prog_test_run_xattr(&test_attr); + if (CHECK(err || test_attr.retval, "test_run", + "err %d errno %d retval %d duration %d\n", + err, errno, test_attr.retval, duration)) + return -1; + return 0; +} + +void test_test_profiler(void) +{ + struct profiler1 *profiler1_skel = NULL; + struct profiler2 *profiler2_skel = NULL; + struct profiler3 *profiler3_skel = NULL; + __u32 duration = 0; + int err; + + profiler1_skel = profiler1__open_and_load(); + if (CHECK(!profiler1_skel, "profiler1_skel_load", "profiler1 skeleton failed\n")) + goto cleanup; + + err = profiler1__attach(profiler1_skel); + if (CHECK(err, "profiler1_attach", "profiler1 attach failed: %d\n", err)) + goto cleanup; + + if (sanity_run(profiler1_skel->progs.raw_tracepoint__sched_process_exec)) + goto cleanup; + + profiler2_skel = profiler2__open_and_load(); + if (CHECK(!profiler2_skel, "profiler2_skel_load", "profiler2 skeleton failed\n")) + goto cleanup; + + err = profiler2__attach(profiler2_skel); + if (CHECK(err, "profiler2_attach", "profiler2 attach failed: %d\n", err)) + goto cleanup; + + if (sanity_run(profiler2_skel->progs.raw_tracepoint__sched_process_exec)) + goto cleanup; + + profiler3_skel = profiler3__open_and_load(); + if (CHECK(!profiler3_skel, "profiler3_skel_load", "profiler3 skeleton failed\n")) + goto cleanup; + + err = profiler3__attach(profiler3_skel); + if (CHECK(err, "profiler3_attach", "profiler3 attach failed: %d\n", err)) + goto cleanup; + + if (sanity_run(profiler3_skel->progs.raw_tracepoint__sched_process_exec)) + goto cleanup; +cleanup: + profiler1__destroy(profiler1_skel); + profiler2__destroy(profiler2_skel); + profiler3__destroy(profiler3_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c index a1f06424cf83..0281095de266 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c @@ -25,7 +25,7 @@ void test_xdp_noinline(void) __u8 flags; } real_def = {.dst = MAGIC_VAL}; __u32 ch_key = 11, real_num = 3; - __u32 duration, retval, size; + __u32 duration = 0, retval, size; int err, i; __u64 bytes = 0, pkts = 0; char buf[128]; diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index b1b2773c0b9d..a943d394fd3a 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -23,6 +23,10 @@ #define TCP_CA_NAME_MAX 16 #endif +#ifndef TCP_NOTSENT_LOWAT +#define TCP_NOTSENT_LOWAT 25 +#endif + #ifndef IFNAMSIZ #define IFNAMSIZ 16 #endif @@ -128,6 +132,18 @@ static __inline int set_keepalive(struct bpf_sock_addr *ctx) return 0; } +static __inline int 
set_notsent_lowat(struct bpf_sock_addr *ctx) +{ + int lowat = 65535; + + if (ctx->type == SOCK_STREAM) { + if (bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat))) + return 1; + } + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -148,6 +164,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) if (set_keepalive(ctx)) return 0; + if (set_notsent_lowat(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) diff --git a/tools/testing/selftests/bpf/progs/profiler.h b/tools/testing/selftests/bpf/progs/profiler.h new file mode 100644 index 000000000000..3bac4fdd4bdf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler.h @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#pragma once + +#define TASK_COMM_LEN 16 +#define MAX_ANCESTORS 4 +#define MAX_PATH 256 +#define KILL_TARGET_LEN 64 +#define CTL_MAXNAME 10 +#define MAX_ARGS_LEN 4096 +#define MAX_FILENAME_LEN 512 +#define MAX_ENVIRON_LEN 8192 +#define MAX_PATH_DEPTH 32 +#define MAX_FILEPATH_LENGTH (MAX_PATH_DEPTH * MAX_PATH) +#define MAX_CGROUPS_PATH_DEPTH 8 + +#define MAX_METADATA_PAYLOAD_LEN TASK_COMM_LEN + +#define MAX_CGROUP_PAYLOAD_LEN \ + (MAX_PATH * 2 + (MAX_PATH * MAX_CGROUPS_PATH_DEPTH)) + +#define MAX_CAP_PAYLOAD_LEN (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN) + +#define MAX_SYSCTL_PAYLOAD_LEN \ + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + CTL_MAXNAME + MAX_PATH) + +#define MAX_KILL_PAYLOAD_LEN \ + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + TASK_COMM_LEN + \ + KILL_TARGET_LEN) + +#define MAX_EXEC_PAYLOAD_LEN \ + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILENAME_LEN + \ + MAX_ARGS_LEN + MAX_ENVIRON_LEN) + +#define MAX_FILEMOD_PAYLOAD_LEN \ + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILEPATH_LENGTH + \ + MAX_FILEPATH_LENGTH) + +enum data_type { + INVALID_EVENT, + EXEC_EVENT, + FORK_EVENT, + KILL_EVENT, + SYSCTL_EVENT, + FILEMOD_EVENT, + MAX_DATA_TYPE_EVENT +}; + +enum filemod_type { + FMOD_OPEN, + FMOD_LINK, + FMOD_SYMLINK, +}; + +struct ancestors_data_t { + pid_t ancestor_pids[MAX_ANCESTORS]; + uint32_t ancestor_exec_ids[MAX_ANCESTORS]; + uint64_t ancestor_start_times[MAX_ANCESTORS]; + uint32_t num_ancestors; +}; + +struct var_metadata_t { + enum data_type type; + pid_t pid; + uint32_t exec_id; + uid_t uid; + gid_t gid; + uint64_t start_time; + uint32_t cpu_id; + uint64_t bpf_stats_num_perf_events; + uint64_t bpf_stats_start_ktime_ns; + uint8_t comm_length; +}; + +struct cgroup_data_t { + ino_t cgroup_root_inode; + ino_t cgroup_proc_inode; + uint64_t cgroup_root_mtime; + uint64_t cgroup_proc_mtime; + uint16_t cgroup_root_length; + uint16_t cgroup_proc_length; + uint16_t cgroup_full_length; + int cgroup_full_path_root_pos; +}; + +struct var_sysctl_data_t { + struct var_metadata_t meta; + struct cgroup_data_t cgroup_data; + struct ancestors_data_t ancestors_info; + uint8_t sysctl_val_length; + uint16_t sysctl_path_length; + char payload[MAX_SYSCTL_PAYLOAD_LEN]; +}; + +struct var_kill_data_t { + struct var_metadata_t meta; + struct cgroup_data_t cgroup_data; + struct ancestors_data_t ancestors_info; + pid_t kill_target_pid; + int kill_sig; + uint32_t kill_count; + uint64_t last_kill_time; + uint8_t kill_target_name_length; + uint8_t kill_target_cgroup_proc_length; + char payload[MAX_KILL_PAYLOAD_LEN]; + size_t payload_length; +}; + +struct var_exec_data_t { + struct var_metadata_t meta; + struct 
cgroup_data_t cgroup_data; + pid_t parent_pid; + uint32_t parent_exec_id; + uid_t parent_uid; + uint64_t parent_start_time; + uint16_t bin_path_length; + uint16_t cmdline_length; + uint16_t environment_length; + char payload[MAX_EXEC_PAYLOAD_LEN]; +}; + +struct var_fork_data_t { + struct var_metadata_t meta; + pid_t parent_pid; + uint32_t parent_exec_id; + uint64_t parent_start_time; + char payload[MAX_METADATA_PAYLOAD_LEN]; +}; + +struct var_filemod_data_t { + struct var_metadata_t meta; + struct cgroup_data_t cgroup_data; + enum filemod_type fmod_type; + unsigned int dst_flags; + uint32_t src_device_id; + uint32_t dst_device_id; + ino_t src_inode; + ino_t dst_inode; + uint16_t src_filepath_length; + uint16_t dst_filepath_length; + char payload[MAX_FILEMOD_PAYLOAD_LEN]; +}; + +struct profiler_config_struct { + bool fetch_cgroups_from_bpf; + ino_t cgroup_fs_inode; + ino_t cgroup_login_session_inode; + uint64_t kill_signals_mask; + ino_t inode_filter; + uint32_t stale_info_secs; + bool use_variable_buffers; + bool read_environ_from_exec; + bool enable_cgroup_v1_resolver; +}; + +struct bpf_func_stats_data { + uint64_t time_elapsed_ns; + uint64_t num_executions; + uint64_t num_perf_events; +}; + +struct bpf_func_stats_ctx { + uint64_t start_time_ns; + struct bpf_func_stats_data* bpf_func_stats_data_val; +}; + +enum bpf_function_id { + profiler_bpf_proc_sys_write, + profiler_bpf_sched_process_exec, + profiler_bpf_sched_process_exit, + profiler_bpf_sys_enter_kill, + profiler_bpf_do_filp_open_ret, + profiler_bpf_sched_process_fork, + profiler_bpf_vfs_link, + profiler_bpf_vfs_symlink, + profiler_bpf_max_function_id +}; diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h new file mode 100644 index 000000000000..00578311a423 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -0,0 +1,969 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <vmlinux.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#include "profiler.h" + +#ifndef NULL +#define NULL 0 +#endif + +#define O_WRONLY 00000001 +#define O_RDWR 00000002 +#define O_DIRECTORY 00200000 +#define __O_TMPFILE 020000000 +#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +#define MAX_ERRNO 4095 +#define S_IFMT 00170000 +#define S_IFSOCK 0140000 +#define S_IFLNK 0120000 +#define S_IFREG 0100000 +#define S_IFBLK 0060000 +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFIFO 0010000 +#define S_ISUID 0004000 +#define S_ISGID 0002000 +#define S_ISVTX 0001000 +#define S_ISLNK(m) (((m)&S_IFMT) == S_IFLNK) +#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR) +#define S_ISCHR(m) (((m)&S_IFMT) == S_IFCHR) +#define S_ISBLK(m) (((m)&S_IFMT) == S_IFBLK) +#define S_ISFIFO(m) (((m)&S_IFMT) == S_IFIFO) +#define S_ISSOCK(m) (((m)&S_IFMT) == S_IFSOCK) +#define IS_ERR_VALUE(x) (unsigned long)(void*)(x) >= (unsigned long)-MAX_ERRNO + +#define KILL_DATA_ARRAY_SIZE 8 + +struct var_kill_data_arr_t { + struct var_kill_data_t array[KILL_DATA_ARRAY_SIZE]; +}; + +union any_profiler_data_t { + struct var_exec_data_t var_exec; + struct var_kill_data_t var_kill; + struct var_sysctl_data_t var_sysctl; + struct var_filemod_data_t var_filemod; + struct var_fork_data_t var_fork; + struct var_kill_data_arr_t var_kill_data_arr; +}; + +volatile struct profiler_config_struct bpf_config = {}; + +#define FETCH_CGROUPS_FROM_BPF (bpf_config.fetch_cgroups_from_bpf) +#define CGROUP_FS_INODE (bpf_config.cgroup_fs_inode) +#define 
CGROUP_LOGIN_SESSION_INODE \ + (bpf_config.cgroup_login_session_inode) +#define KILL_SIGNALS (bpf_config.kill_signals_mask) +#define STALE_INFO (bpf_config.stale_info_secs) +#define INODE_FILTER (bpf_config.inode_filter) +#define READ_ENVIRON_FROM_EXEC (bpf_config.read_environ_from_exec) +#define ENABLE_CGROUP_V1_RESOLVER (bpf_config.enable_cgroup_v1_resolver) + +struct kernfs_iattrs___52 { + struct iattr ia_iattr; +}; + +struct kernfs_node___52 { + union /* kernfs_node_id */ { + struct { + u32 ino; + u32 generation; + }; + u64 id; + } id; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, union any_profiler_data_t); +} data_heap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} events SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, KILL_DATA_ARRAY_SIZE); + __type(key, u32); + __type(value, struct var_kill_data_arr_t); +} var_tpid_to_data SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, profiler_bpf_max_function_id); + __type(key, u32); + __type(value, struct bpf_func_stats_data); +} bpf_func_stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, bool); + __uint(max_entries, 16); +} allowed_devices SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, bool); + __uint(max_entries, 1024); +} allowed_file_inodes SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, bool); + __uint(max_entries, 1024); +} allowed_directory_inodes SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, bool); + __uint(max_entries, 16); +} disallowed_exec_inodes SEC(".maps"); + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) +#endif + +static INLINE bool IS_ERR(const void* ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static INLINE u32 get_userspace_pid() +{ + return bpf_get_current_pid_tgid() >> 32; +} + +static INLINE bool is_init_process(u32 tgid) +{ + return tgid == 1 || tgid == 0; +} + +static INLINE unsigned long +probe_read_lim(void* dst, void* src, unsigned long len, unsigned long max) +{ + len = len < max ? 
len : max; + if (len > 1) { + if (bpf_probe_read(dst, len, src)) + return 0; + } else if (len == 1) { + if (bpf_probe_read(dst, 1, src)) + return 0; + } + return len; +} + +static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct, + int spid) +{ +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) + if (arr_struct->array[i].meta.pid == spid) + return i; + return -1; +} + +static INLINE void populate_ancestors(struct task_struct* task, + struct ancestors_data_t* ancestors_data) +{ + struct task_struct* parent = task; + u32 num_ancestors, ppid; + + ancestors_data->num_ancestors = 0; +#ifdef UNROLL +#pragma unroll +#endif + for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) { + parent = BPF_CORE_READ(parent, real_parent); + if (parent == NULL) + break; + ppid = BPF_CORE_READ(parent, tgid); + if (is_init_process(ppid)) + break; + ancestors_data->ancestor_pids[num_ancestors] = ppid; + ancestors_data->ancestor_exec_ids[num_ancestors] = + BPF_CORE_READ(parent, self_exec_id); + ancestors_data->ancestor_start_times[num_ancestors] = + BPF_CORE_READ(parent, start_time); + ancestors_data->num_ancestors = num_ancestors; + } +} + +static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node, + struct kernfs_node* cgroup_root_node, + void* payload, + int* root_pos) +{ + void* payload_start = payload; + size_t filepart_length; + +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { + filepart_length = + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(cgroup_node, name)); + if (!cgroup_node) + return payload; + if (cgroup_node == cgroup_root_node) + *root_pos = payload - payload_start; + if (filepart_length <= MAX_PATH) { + barrier_var(filepart_length); + payload += filepart_length; + } + cgroup_node = BPF_CORE_READ(cgroup_node, parent); + } + return payload; +} + +static ino_t get_inode_from_kernfs(struct kernfs_node* node) +{ + struct kernfs_node___52* node52 = (void*)node; + + if (bpf_core_field_exists(node52->id.ino)) { + barrier_var(node52); + return BPF_CORE_READ(node52, id.ino); + } else { + barrier_var(node); + return (u64)BPF_CORE_READ(node, id); + } +} + +int pids_cgrp_id = 1; + +static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, + struct task_struct* task, + void* payload) +{ + struct kernfs_node* root_kernfs = + BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn); + struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); + + if (ENABLE_CGROUP_V1_RESOLVER) { +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys_state* subsys = + BPF_CORE_READ(task, cgroups, subsys[i]); + if (subsys != NULL) { + int subsys_id = BPF_CORE_READ(subsys, ss, id); + if (subsys_id == pids_cgrp_id) { + proc_kernfs = BPF_CORE_READ(subsys, cgroup, kn); + root_kernfs = BPF_CORE_READ(subsys, ss, root, kf_root, kn); + break; + } + } + } + } + + cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs); + cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs); + + if (bpf_core_field_exists(root_kernfs->iattr->ia_mtime)) { + cgroup_data->cgroup_root_mtime = + BPF_CORE_READ(root_kernfs, iattr, ia_mtime.tv_nsec); + cgroup_data->cgroup_proc_mtime = + BPF_CORE_READ(proc_kernfs, iattr, ia_mtime.tv_nsec); + } else { + struct kernfs_iattrs___52* root_iattr = + (struct kernfs_iattrs___52*)BPF_CORE_READ(root_kernfs, iattr); + cgroup_data->cgroup_root_mtime = + 
BPF_CORE_READ(root_iattr, ia_iattr.ia_mtime.tv_nsec); + + struct kernfs_iattrs___52* proc_iattr = + (struct kernfs_iattrs___52*)BPF_CORE_READ(proc_kernfs, iattr); + cgroup_data->cgroup_proc_mtime = + BPF_CORE_READ(proc_iattr, ia_iattr.ia_mtime.tv_nsec); + } + + cgroup_data->cgroup_root_length = 0; + cgroup_data->cgroup_proc_length = 0; + cgroup_data->cgroup_full_length = 0; + + size_t cgroup_root_length = + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(root_kernfs, name)); + barrier_var(cgroup_root_length); + if (cgroup_root_length <= MAX_PATH) { + barrier_var(cgroup_root_length); + cgroup_data->cgroup_root_length = cgroup_root_length; + payload += cgroup_root_length; + } + + size_t cgroup_proc_length = + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(proc_kernfs, name)); + barrier_var(cgroup_proc_length); + if (cgroup_proc_length <= MAX_PATH) { + barrier_var(cgroup_proc_length); + cgroup_data->cgroup_proc_length = cgroup_proc_length; + payload += cgroup_proc_length; + } + + if (FETCH_CGROUPS_FROM_BPF) { + cgroup_data->cgroup_full_path_root_pos = -1; + void* payload_end_pos = read_full_cgroup_path(proc_kernfs, root_kernfs, payload, + &cgroup_data->cgroup_full_path_root_pos); + cgroup_data->cgroup_full_length = payload_end_pos - payload; + payload = payload_end_pos; + } + + return (void*)payload; +} + +static INLINE void* populate_var_metadata(struct var_metadata_t* metadata, + struct task_struct* task, + u32 pid, void* payload) +{ + u64 uid_gid = bpf_get_current_uid_gid(); + + metadata->uid = (u32)uid_gid; + metadata->gid = uid_gid >> 32; + metadata->pid = pid; + metadata->exec_id = BPF_CORE_READ(task, self_exec_id); + metadata->start_time = BPF_CORE_READ(task, start_time); + metadata->comm_length = 0; + + size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm); + barrier_var(comm_length); + if (comm_length <= TASK_COMM_LEN) { + barrier_var(comm_length); + metadata->comm_length = comm_length; + payload += comm_length; + } + + return (void*)payload; +} + +static INLINE struct var_kill_data_t* +get_var_kill_data(struct pt_regs* ctx, int spid, int tpid, int sig) +{ + int zero = 0; + struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero); + + if (kill_data == NULL) + return NULL; + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + void* payload = populate_var_metadata(&kill_data->meta, task, spid, kill_data->payload); + payload = populate_cgroup_info(&kill_data->cgroup_data, task, payload); + size_t payload_length = payload - (void*)kill_data->payload; + kill_data->payload_length = payload_length; + populate_ancestors(task, &kill_data->ancestors_info); + kill_data->meta.type = KILL_EVENT; + kill_data->kill_target_pid = tpid; + kill_data->kill_sig = sig; + kill_data->kill_count = 1; + kill_data->last_kill_time = bpf_ktime_get_ns(); + return kill_data; +} + +static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig) +{ + if ((KILL_SIGNALS & (1ULL << sig)) == 0) + return 0; + + u32 spid = get_userspace_pid(); + struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid); + + if (arr_struct == NULL) { + struct var_kill_data_t* kill_data = get_var_kill_data(ctx, spid, tpid, sig); + int zero = 0; + + if (kill_data == NULL) + return 0; + arr_struct = bpf_map_lookup_elem(&data_heap, &zero); + if (arr_struct == NULL) + return 0; + bpf_probe_read(&arr_struct->array[0], sizeof(arr_struct->array[0]), kill_data); + } else { + int index = get_var_spid_index(arr_struct, spid); + + if (index == -1) 
{ + struct var_kill_data_t* kill_data = + get_var_kill_data(ctx, spid, tpid, sig); + if (kill_data == NULL) + return 0; +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) + if (arr_struct->array[i].meta.pid == 0) { + bpf_probe_read(&arr_struct->array[i], + sizeof(arr_struct->array[i]), kill_data); + bpf_map_update_elem(&var_tpid_to_data, &tpid, + arr_struct, 0); + + return 0; + } + return 0; + } + + struct var_kill_data_t* kill_data = &arr_struct->array[index]; + + u64 delta_sec = + (bpf_ktime_get_ns() - kill_data->last_kill_time) / 1000000000; + + if (delta_sec < STALE_INFO) { + kill_data->kill_count++; + kill_data->last_kill_time = bpf_ktime_get_ns(); + bpf_probe_read(&arr_struct->array[index], + sizeof(arr_struct->array[index]), + kill_data); + } else { + struct var_kill_data_t* kill_data = + get_var_kill_data(ctx, spid, tpid, sig); + if (kill_data == NULL) + return 0; + bpf_probe_read(&arr_struct->array[index], + sizeof(arr_struct->array[index]), + kill_data); + } + } + bpf_map_update_elem(&var_tpid_to_data, &tpid, arr_struct, 0); + return 0; +} + +static INLINE void bpf_stats_enter(struct bpf_func_stats_ctx* bpf_stat_ctx, + enum bpf_function_id func_id) +{ + int func_id_key = func_id; + + bpf_stat_ctx->start_time_ns = bpf_ktime_get_ns(); + bpf_stat_ctx->bpf_func_stats_data_val = + bpf_map_lookup_elem(&bpf_func_stats, &func_id_key); + if (bpf_stat_ctx->bpf_func_stats_data_val) + bpf_stat_ctx->bpf_func_stats_data_val->num_executions++; +} + +static INLINE void bpf_stats_exit(struct bpf_func_stats_ctx* bpf_stat_ctx) +{ + if (bpf_stat_ctx->bpf_func_stats_data_val) + bpf_stat_ctx->bpf_func_stats_data_val->time_elapsed_ns += + bpf_ktime_get_ns() - bpf_stat_ctx->start_time_ns; +} + +static INLINE void +bpf_stats_pre_submit_var_perf_event(struct bpf_func_stats_ctx* bpf_stat_ctx, + struct var_metadata_t* meta) +{ + if (bpf_stat_ctx->bpf_func_stats_data_val) { + bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events++; + meta->bpf_stats_num_perf_events = + bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events; + } + meta->bpf_stats_start_ktime_ns = bpf_stat_ctx->start_time_ns; + meta->cpu_id = bpf_get_smp_processor_id(); +} + +static INLINE size_t +read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload) +{ + size_t length = 0; + size_t filepart_length; + struct dentry* parent_dentry; + +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < MAX_PATH_DEPTH; i++) { + filepart_length = bpf_probe_read_str(payload, MAX_PATH, + BPF_CORE_READ(filp_dentry, d_name.name)); + barrier_var(filepart_length); + if (filepart_length > MAX_PATH) + break; + barrier_var(filepart_length); + payload += filepart_length; + length += filepart_length; + + parent_dentry = BPF_CORE_READ(filp_dentry, d_parent); + if (filp_dentry == parent_dentry) + break; + filp_dentry = parent_dentry; + } + + return length; +} + +static INLINE bool +is_ancestor_in_allowed_inodes(struct dentry* filp_dentry) +{ + struct dentry* parent_dentry; +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < MAX_PATH_DEPTH; i++) { + u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino); + bool* allowed_dir = bpf_map_lookup_elem(&allowed_directory_inodes, &dir_ino); + + if (allowed_dir != NULL) + return true; + parent_dentry = BPF_CORE_READ(filp_dentry, d_parent); + if (filp_dentry == parent_dentry) + break; + filp_dentry = parent_dentry; + } + return false; +} + +static INLINE bool is_dentry_allowed_for_filemod(struct dentry* file_dentry, + u32* device_id, + u64* 
file_ino) +{ + u32 dev_id = BPF_CORE_READ(file_dentry, d_sb, s_dev); + *device_id = dev_id; + bool* allowed_device = bpf_map_lookup_elem(&allowed_devices, &dev_id); + + if (allowed_device == NULL) + return false; + + u64 ino = BPF_CORE_READ(file_dentry, d_inode, i_ino); + *file_ino = ino; + bool* allowed_file = bpf_map_lookup_elem(&allowed_file_inodes, &ino); + + if (allowed_file == NULL) + if (!is_ancestor_in_allowed_inodes(BPF_CORE_READ(file_dentry, d_parent))) + return false; + return true; +} + +SEC("kprobe/proc_sys_write") +ssize_t BPF_KPROBE(kprobe__proc_sys_write, + struct file* filp, const char* buf, + size_t count, loff_t* ppos) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_proc_sys_write); + + u32 pid = get_userspace_pid(); + int zero = 0; + struct var_sysctl_data_t* sysctl_data = + bpf_map_lookup_elem(&data_heap, &zero); + if (!sysctl_data) + goto out; + + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + sysctl_data->meta.type = SYSCTL_EVENT; + void* payload = populate_var_metadata(&sysctl_data->meta, task, pid, sysctl_data->payload); + payload = populate_cgroup_info(&sysctl_data->cgroup_data, task, payload); + + populate_ancestors(task, &sysctl_data->ancestors_info); + + sysctl_data->sysctl_val_length = 0; + sysctl_data->sysctl_path_length = 0; + + size_t sysctl_val_length = bpf_probe_read_str(payload, CTL_MAXNAME, buf); + barrier_var(sysctl_val_length); + if (sysctl_val_length <= CTL_MAXNAME) { + barrier_var(sysctl_val_length); + sysctl_data->sysctl_val_length = sysctl_val_length; + payload += sysctl_val_length; + } + + size_t sysctl_path_length = bpf_probe_read_str(payload, MAX_PATH, + BPF_CORE_READ(filp, f_path.dentry, d_name.name)); + barrier_var(sysctl_path_length); + if (sysctl_path_length <= MAX_PATH) { + barrier_var(sysctl_path_length); + sysctl_data->sysctl_path_length = sysctl_path_length; + payload += sysctl_path_length; + } + + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &sysctl_data->meta); + unsigned long data_len = payload - (void*)sysctl_data; + data_len = data_len > sizeof(struct var_sysctl_data_t) + ? 
sizeof(struct var_sysctl_data_t) + : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, sysctl_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("tracepoint/syscalls/sys_enter_kill") +int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx) +{ + struct bpf_func_stats_ctx stats_ctx; + + bpf_stats_enter(&stats_ctx, profiler_bpf_sys_enter_kill); + int pid = ctx->args[0]; + int sig = ctx->args[1]; + int ret = trace_var_sys_kill(ctx, pid, sig); + bpf_stats_exit(&stats_ctx); + return ret; +}; + +SEC("raw_tracepoint/sched_process_exit") +int raw_tracepoint__sched_process_exit(void* ctx) +{ + int zero = 0; + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exit); + + u32 tpid = get_userspace_pid(); + + struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid); + struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero); + + if (arr_struct == NULL || kill_data == NULL) + goto out; + + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); + +#ifdef UNROLL +#pragma unroll +#endif + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) { + struct var_kill_data_t* past_kill_data = &arr_struct->array[i]; + + if (past_kill_data != NULL && past_kill_data->kill_target_pid == tpid) { + bpf_probe_read(kill_data, sizeof(*past_kill_data), past_kill_data); + void* payload = kill_data->payload; + size_t offset = kill_data->payload_length; + if (offset >= MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN) + return 0; + payload += offset; + + kill_data->kill_target_name_length = 0; + kill_data->kill_target_cgroup_proc_length = 0; + + size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm); + barrier_var(comm_length); + if (comm_length <= TASK_COMM_LEN) { + barrier_var(comm_length); + kill_data->kill_target_name_length = comm_length; + payload += comm_length; + } + + size_t cgroup_proc_length = bpf_probe_read_str(payload, KILL_TARGET_LEN, + BPF_CORE_READ(proc_kernfs, name)); + barrier_var(cgroup_proc_length); + if (cgroup_proc_length <= KILL_TARGET_LEN) { + barrier_var(cgroup_proc_length); + kill_data->kill_target_cgroup_proc_length = cgroup_proc_length; + payload += cgroup_proc_length; + } + + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &kill_data->meta); + unsigned long data_len = (void*)payload - (void*)kill_data; + data_len = data_len > sizeof(struct var_kill_data_t) + ? 
sizeof(struct var_kill_data_t) + : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, kill_data, data_len); + } + } + bpf_map_delete_elem(&var_tpid_to_data, &tpid); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("raw_tracepoint/sched_process_exec") +int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exec); + + struct linux_binprm* bprm = (struct linux_binprm*)ctx->args[2]; + u64 inode = BPF_CORE_READ(bprm, file, f_inode, i_ino); + + bool* should_filter_binprm = bpf_map_lookup_elem(&disallowed_exec_inodes, &inode); + if (should_filter_binprm != NULL) + goto out; + + int zero = 0; + struct var_exec_data_t* proc_exec_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!proc_exec_data) + goto out; + + if (INODE_FILTER && inode != INODE_FILTER) + return 0; + + u32 pid = get_userspace_pid(); + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + proc_exec_data->meta.type = EXEC_EVENT; + proc_exec_data->bin_path_length = 0; + proc_exec_data->cmdline_length = 0; + proc_exec_data->environment_length = 0; + void* payload = populate_var_metadata(&proc_exec_data->meta, task, pid, + proc_exec_data->payload); + payload = populate_cgroup_info(&proc_exec_data->cgroup_data, task, payload); + + struct task_struct* parent_task = BPF_CORE_READ(task, real_parent); + proc_exec_data->parent_pid = BPF_CORE_READ(parent_task, tgid); + proc_exec_data->parent_uid = BPF_CORE_READ(parent_task, real_cred, uid.val); + proc_exec_data->parent_exec_id = BPF_CORE_READ(parent_task, self_exec_id); + proc_exec_data->parent_start_time = BPF_CORE_READ(parent_task, start_time); + + const char* filename = BPF_CORE_READ(bprm, filename); + size_t bin_path_length = bpf_probe_read_str(payload, MAX_FILENAME_LEN, filename); + barrier_var(bin_path_length); + if (bin_path_length <= MAX_FILENAME_LEN) { + barrier_var(bin_path_length); + proc_exec_data->bin_path_length = bin_path_length; + payload += bin_path_length; + } + + void* arg_start = (void*)BPF_CORE_READ(task, mm, arg_start); + void* arg_end = (void*)BPF_CORE_READ(task, mm, arg_end); + unsigned int cmdline_length = probe_read_lim(payload, arg_start, + arg_end - arg_start, MAX_ARGS_LEN); + + if (cmdline_length <= MAX_ARGS_LEN) { + barrier_var(cmdline_length); + proc_exec_data->cmdline_length = cmdline_length; + payload += cmdline_length; + } + + if (READ_ENVIRON_FROM_EXEC) { + void* env_start = (void*)BPF_CORE_READ(task, mm, env_start); + void* env_end = (void*)BPF_CORE_READ(task, mm, env_end); + unsigned long env_len = probe_read_lim(payload, env_start, + env_end - env_start, MAX_ENVIRON_LEN); + if (env_len <= MAX_ENVIRON_LEN) { + proc_exec_data->environment_length = env_len; + payload += env_len; + } + } + + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &proc_exec_data->meta); + unsigned long data_len = payload - (void*)proc_exec_data; + data_len = data_len > sizeof(struct var_exec_data_t) + ? 
sizeof(struct var_exec_data_t) + : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, proc_exec_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("kretprobe/do_filp_open") +int kprobe_ret__do_filp_open(struct pt_regs* ctx) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret); + + struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx); + + if (filp == NULL || IS_ERR(filp)) + goto out; + unsigned int flags = BPF_CORE_READ(filp, f_flags); + if ((flags & (O_RDWR | O_WRONLY)) == 0) + goto out; + if ((flags & O_TMPFILE) > 0) + goto out; + struct inode* file_inode = BPF_CORE_READ(filp, f_inode); + umode_t mode = BPF_CORE_READ(file_inode, i_mode); + if (S_ISDIR(mode) || S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || + S_ISSOCK(mode)) + goto out; + + struct dentry* filp_dentry = BPF_CORE_READ(filp, f_path.dentry); + u32 device_id = 0; + u64 file_ino = 0; + if (!is_dentry_allowed_for_filemod(filp_dentry, &device_id, &file_ino)) + goto out; + + int zero = 0; + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!filemod_data) + goto out; + + u32 pid = get_userspace_pid(); + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + filemod_data->meta.type = FILEMOD_EVENT; + filemod_data->fmod_type = FMOD_OPEN; + filemod_data->dst_flags = flags; + filemod_data->src_inode = 0; + filemod_data->dst_inode = file_ino; + filemod_data->src_device_id = 0; + filemod_data->dst_device_id = device_id; + filemod_data->src_filepath_length = 0; + filemod_data->dst_filepath_length = 0; + + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, + filemod_data->payload); + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); + + size_t len = read_absolute_file_path_from_dentry(filp_dentry, payload); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->dst_filepath_length = len; + } + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); + unsigned long data_len = payload - (void*)filemod_data; + data_len = data_len > sizeof(*filemod_data) ? 
sizeof(*filemod_data) : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("kprobe/vfs_link") +int BPF_KPROBE(kprobe__vfs_link, + struct dentry* old_dentry, struct inode* dir, + struct dentry* new_dentry, struct inode** delegated_inode) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_link); + + u32 src_device_id = 0; + u64 src_file_ino = 0; + u32 dst_device_id = 0; + u64 dst_file_ino = 0; + if (!is_dentry_allowed_for_filemod(old_dentry, &src_device_id, &src_file_ino) && + !is_dentry_allowed_for_filemod(new_dentry, &dst_device_id, &dst_file_ino)) + goto out; + + int zero = 0; + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!filemod_data) + goto out; + + u32 pid = get_userspace_pid(); + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + filemod_data->meta.type = FILEMOD_EVENT; + filemod_data->fmod_type = FMOD_LINK; + filemod_data->dst_flags = 0; + filemod_data->src_inode = src_file_ino; + filemod_data->dst_inode = dst_file_ino; + filemod_data->src_device_id = src_device_id; + filemod_data->dst_device_id = dst_device_id; + filemod_data->src_filepath_length = 0; + filemod_data->dst_filepath_length = 0; + + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, + filemod_data->payload); + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); + + size_t len = read_absolute_file_path_from_dentry(old_dentry, payload); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->src_filepath_length = len; + } + + len = read_absolute_file_path_from_dentry(new_dentry, payload); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->dst_filepath_length = len; + } + + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); + unsigned long data_len = payload - (void*)filemod_data; + data_len = data_len > sizeof(*filemod_data) ? 
sizeof(*filemod_data) : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("kprobe/vfs_symlink") +int BPF_KPROBE(kprobe__vfs_symlink, struct inode* dir, struct dentry* dentry, + const char* oldname) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_symlink); + + u32 dst_device_id = 0; + u64 dst_file_ino = 0; + if (!is_dentry_allowed_for_filemod(dentry, &dst_device_id, &dst_file_ino)) + goto out; + + int zero = 0; + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!filemod_data) + goto out; + + u32 pid = get_userspace_pid(); + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + + filemod_data->meta.type = FILEMOD_EVENT; + filemod_data->fmod_type = FMOD_SYMLINK; + filemod_data->dst_flags = 0; + filemod_data->src_inode = 0; + filemod_data->dst_inode = dst_file_ino; + filemod_data->src_device_id = 0; + filemod_data->dst_device_id = dst_device_id; + filemod_data->src_filepath_length = 0; + filemod_data->dst_filepath_length = 0; + + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, + filemod_data->payload); + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); + + size_t len = bpf_probe_read_str(payload, MAX_FILEPATH_LENGTH, oldname); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->src_filepath_length = len; + } + len = read_absolute_file_path_from_dentry(dentry, payload); + barrier_var(len); + if (len <= MAX_FILEPATH_LENGTH) { + barrier_var(len); + payload += len; + filemod_data->dst_filepath_length = len; + } + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); + unsigned long data_len = payload - (void*)filemod_data; + data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} + +SEC("raw_tracepoint/sched_process_fork") +int raw_tracepoint__sched_process_fork(struct bpf_raw_tracepoint_args* ctx) +{ + struct bpf_func_stats_ctx stats_ctx; + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_fork); + + int zero = 0; + struct var_fork_data_t* fork_data = bpf_map_lookup_elem(&data_heap, &zero); + if (!fork_data) + goto out; + + struct task_struct* parent = (struct task_struct*)ctx->args[0]; + struct task_struct* child = (struct task_struct*)ctx->args[1]; + fork_data->meta.type = FORK_EVENT; + + void* payload = populate_var_metadata(&fork_data->meta, child, + BPF_CORE_READ(child, pid), fork_data->payload); + fork_data->parent_pid = BPF_CORE_READ(parent, pid); + fork_data->parent_exec_id = BPF_CORE_READ(parent, self_exec_id); + fork_data->parent_start_time = BPF_CORE_READ(parent, start_time); + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &fork_data->meta); + + unsigned long data_len = payload - (void*)fork_data; + data_len = data_len > sizeof(*fork_data) ? 
sizeof(*fork_data) : data_len; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, fork_data, data_len); +out: + bpf_stats_exit(&stats_ctx); + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/profiler1.c b/tools/testing/selftests/bpf/progs/profiler1.c new file mode 100644 index 000000000000..4df9088bfc00 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler1.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var)) +#define UNROLL +#define INLINE __always_inline +#include "profiler.inc.h" diff --git a/tools/testing/selftests/bpf/progs/profiler2.c b/tools/testing/selftests/bpf/progs/profiler2.c new file mode 100644 index 000000000000..0f32a3cbf556 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler2.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define barrier_var(var) /**/ +/* undef #define UNROLL */ +#define INLINE /**/ +#include "profiler.inc.h" diff --git a/tools/testing/selftests/bpf/progs/profiler3.c b/tools/testing/selftests/bpf/progs/profiler3.c new file mode 100644 index 000000000000..6249fc31ccb0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/profiler3.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define barrier_var(var) /**/ +#define UNROLL +#define INLINE __noinline +#include "profiler.inc.h" diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c index 193fe0198b21..c1e0c8c7c55f 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c +++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c @@ -41,6 +41,43 @@ struct outer_arr { .values = { (void *)&inner_map1, 0, (void *)&inner_map2 }, }; +struct inner_map_sz3 { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 3); + __type(key, int); + __type(value, int); +} inner_map3 SEC(".maps"), + inner_map4 SEC(".maps"); + +struct inner_map_sz4 { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 5); + __type(key, int); + __type(value, int); +} inner_map5 SEC(".maps"); + +struct outer_arr_dyn { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 3); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + }); +} outer_arr_dyn SEC(".maps") = { + .values = { + [0] = (void *)&inner_map3, + [1] = (void *)&inner_map4, + [2] = (void *)&inner_map5, + }, +}; + struct outer_hash { __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); __uint(max_entries, 5); @@ -101,6 +138,12 @@ int handle__sys_enter(void *ctx) val = input + 1; bpf_map_update_elem(inner_map, &key, &val, 0); + inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); + if (!inner_map) + return 1; + val = input + 2; + bpf_map_update_elem(inner_map, &key, &val, 0); + return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_core_autosize.c b/tools/testing/selftests/bpf/progs/test_core_autosize.c new file mode 100644 index 000000000000..44f5aa2e8956 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_core_autosize.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <linux/bpf.h> +#include <stdint.h> 
+#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> + +char _license[] SEC("license") = "GPL"; + +/* fields of exactly the same size */ +struct test_struct___samesize { + void *ptr; + unsigned long long val1; + unsigned int val2; + unsigned short val3; + unsigned char val4; +} __attribute((preserve_access_index)); + +/* unsigned fields that have to be downsized by libbpf */ +struct test_struct___downsize { + void *ptr; + unsigned long val1; + unsigned long val2; + unsigned long val3; + unsigned long val4; + /* total sz: 40 */ +} __attribute__((preserve_access_index)); + +/* fields with signed integers of wrong size, should be rejected */ +struct test_struct___signed { + void *ptr; + long val1; + long val2; + long val3; + long val4; +} __attribute((preserve_access_index)); + +/* real layout and sizes according to test's (32-bit) BTF */ +struct test_struct___real { + unsigned int ptr; /* can't use `void *`, it is always 8 byte in BPF target */ + unsigned int val2; + unsigned long long val1; + unsigned short val3; + unsigned char val4; + unsigned char _pad; + /* total sz: 20 */ +}; + +struct test_struct___real input = { + .ptr = 0x01020304, + .val1 = 0x1020304050607080, + .val2 = 0x0a0b0c0d, + .val3 = 0xfeed, + .val4 = 0xb9, + ._pad = 0xff, /* make sure no accidental zeros are present */ +}; + +unsigned long long ptr_samesized = 0; +unsigned long long val1_samesized = 0; +unsigned long long val2_samesized = 0; +unsigned long long val3_samesized = 0; +unsigned long long val4_samesized = 0; +struct test_struct___real output_samesized = {}; + +unsigned long long ptr_downsized = 0; +unsigned long long val1_downsized = 0; +unsigned long long val2_downsized = 0; +unsigned long long val3_downsized = 0; +unsigned long long val4_downsized = 0; +struct test_struct___real output_downsized = {}; + +unsigned long long ptr_probed = 0; +unsigned long long val1_probed = 0; +unsigned long long val2_probed = 0; +unsigned long long val3_probed = 0; +unsigned long long val4_probed = 0; + +unsigned long long ptr_signed = 0; +unsigned long long val1_signed = 0; +unsigned long long val2_signed = 0; +unsigned long long val3_signed = 0; +unsigned long long val4_signed = 0; +struct test_struct___real output_signed = {}; + +SEC("raw_tp/sys_exit") +int handle_samesize(void *ctx) +{ + struct test_struct___samesize *in = (void *)&input; + struct test_struct___samesize *out = (void *)&output_samesized; + + ptr_samesized = (unsigned long long)in->ptr; + val1_samesized = in->val1; + val2_samesized = in->val2; + val3_samesized = in->val3; + val4_samesized = in->val4; + + out->ptr = in->ptr; + out->val1 = in->val1; + out->val2 = in->val2; + out->val3 = in->val3; + out->val4 = in->val4; + + return 0; +} + +SEC("raw_tp/sys_exit") +int handle_downsize(void *ctx) +{ + struct test_struct___downsize *in = (void *)&input; + struct test_struct___downsize *out = (void *)&output_downsized; + + ptr_downsized = (unsigned long long)in->ptr; + val1_downsized = in->val1; + val2_downsized = in->val2; + val3_downsized = in->val3; + val4_downsized = in->val4; + + out->ptr = in->ptr; + out->val1 = in->val1; + out->val2 = in->val2; + out->val3 = in->val3; + out->val4 = in->val4; + + return 0; +} + +SEC("raw_tp/sys_enter") +int handle_probed(void *ctx) +{ + struct test_struct___downsize *in = (void *)&input; + __u64 tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->ptr), &in->ptr); + ptr_probed = tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->val1), &in->val1); + val1_probed = tmp; + + tmp = 0; + 
bpf_core_read(&tmp, bpf_core_field_size(in->val2), &in->val2); + val2_probed = tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->val3), &in->val3); + val3_probed = tmp; + + tmp = 0; + bpf_core_read(&tmp, bpf_core_field_size(in->val4), &in->val4); + val4_probed = tmp; + + return 0; +} + +SEC("raw_tp/sys_enter") +int handle_signed(void *ctx) +{ + struct test_struct___signed *in = (void *)&input; + struct test_struct___signed *out = (void *)&output_signed; + + val2_signed = in->val2; + val3_signed = in->val3; + val4_signed = in->val4; + + out->val2= in->val2; + out->val3= in->val3; + out->val4= in->val4; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c new file mode 100644 index 000000000000..bb8ea9270f29 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Google */ + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> + +__u64 out__runqueues_addr = -1; +__u64 out__bpf_prog_active_addr = -1; + +__u32 out__rq_cpu = -1; /* percpu struct fields */ +int out__bpf_prog_active = -1; /* percpu int */ + +__u32 out__this_rq_cpu = -1; +int out__this_bpf_prog_active = -1; + +__u32 out__cpu_0_rq_cpu = -1; /* cpu_rq(0)->cpu */ + +extern const struct rq runqueues __ksym; /* struct type global var. */ +extern const int bpf_prog_active __ksym; /* int type global var. */ + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + struct rq *rq; + int *active; + __u32 cpu; + + out__runqueues_addr = (__u64)&runqueues; + out__bpf_prog_active_addr = (__u64)&bpf_prog_active; + + cpu = bpf_get_smp_processor_id(); + + /* test bpf_per_cpu_ptr() */ + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu); + if (rq) + out__rq_cpu = rq->cpu; + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) + out__bpf_prog_active = *active; + + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0); + if (rq) /* should always be valid, but we can't spare the check. 
*/ + out__cpu_0_rq_cpu = rq->cpu; + + /* test bpf_this_cpu_ptr */ + rq = (struct rq *)bpf_this_cpu_ptr(&runqueues); + out__this_rq_cpu = rq->cpu; + active = (int *)bpf_this_cpu_ptr(&bpf_prog_active); + out__this_bpf_prog_active = *active; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c index 3a216d1d0226..72ec0178f653 100644 --- a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c @@ -304,10 +304,10 @@ int misc_estab(struct bpf_sock_ops *skops) passive_lport_n = __bpf_htons(passive_lport_h); bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, &true_val, sizeof(true_val)); - set_hdr_cb_flags(skops); + set_hdr_cb_flags(skops, 0); break; case BPF_SOCK_OPS_TCP_CONNECT_CB: - set_hdr_cb_flags(skops); + set_hdr_cb_flags(skops, 0); break; case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: return handle_parse_hdr(skops); diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 3dca4c2e2418..1858435de7aa 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -131,39 +131,55 @@ int bpf_prog2(struct __sk_buff *skb) } -SEC("sk_skb3") -int bpf_prog3(struct __sk_buff *skb) +static inline void bpf_write_pass(struct __sk_buff *skb, int offset) { - const int one = 1; - int err, *f, ret = SK_PASS; + int err = bpf_skb_pull_data(skb, 6 + offset); void *data_end; char *c; - err = bpf_skb_pull_data(skb, 19); if (err) - goto tls_out; + return; c = (char *)(long)skb->data; data_end = (void *)(long)skb->data_end; - if (c + 18 < data_end) - memcpy(&c[13], "PASS", 4); + if (c + 5 + offset < data_end) + memcpy(c + offset, "PASS", 4); +} + +SEC("sk_skb3") +int bpf_prog3(struct __sk_buff *skb) +{ + int err, *f, ret = SK_PASS; + const int one = 1; + f = bpf_map_lookup_elem(&sock_skb_opts, &one); if (f && *f) { __u64 flags = 0; ret = 0; flags = *f; + + err = bpf_skb_adjust_room(skb, -13, 0, 0); + if (err) + return SK_DROP; + err = bpf_skb_adjust_room(skb, 4, 0, 0); + if (err) + return SK_DROP; + bpf_write_pass(skb, 0); #ifdef SOCKMAP return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags); #else return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); #endif } - f = bpf_map_lookup_elem(&sock_skb_opts, &one); if (f && *f) ret = SK_DROP; + err = bpf_skb_adjust_room(skb, 4, 0, 0); + if (err) + return SK_DROP; + bpf_write_pass(skb, 13); tls_out: return ret; } diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c index 889a72c3024f..fe182616b112 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c @@ -13,17 +13,10 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#ifndef barrier_data -# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") -#endif - #ifndef ctx_ptr # define ctx_ptr(field) (void *)(long)(field) #endif -#define dst_to_src_tmp 0xeeddddeeU -#define src_to_dst_tmp 0xeeffffeeU - #define ip4_src 0xac100164 /* 172.16.1.100 */ #define ip4_dst 0xac100264 /* 172.16.2.100 */ @@ -39,6 +32,18 @@ a.s6_addr32[3] == b.s6_addr32[3]) #endif +enum { + dev_src, + dev_dst, +}; + +struct bpf_map_def SEC("maps") ifindex_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2, 
+}; + static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, __be32 addr) { @@ -73,7 +78,14 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb, return v6_equal(ip6h->daddr, addr); } -SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) +static __always_inline int get_dev_ifindex(int which) +{ + int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); + + return ifindex ? *ifindex : 0; +} + +SEC("chk_egress") int tc_chk(struct __sk_buff *skb) { void *data_end = ctx_ptr(skb->data_end); void *data = ctx_ptr(skb->data); @@ -87,7 +99,6 @@ SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) { - int idx = dst_to_src_tmp; __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -103,19 +114,15 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) if (!redirect) return TC_ACT_OK; - barrier_data(&idx); - idx = bpf_ntohl(idx); - __builtin_memset(&zero, 0, sizeof(zero)); if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(idx, 0); + return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0); } SEC("src_ingress") int tc_src(struct __sk_buff *skb) { - int idx = src_to_dst_tmp; __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -131,14 +138,11 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb) if (!redirect) return TC_ACT_OK; - barrier_data(&idx); - idx = bpf_ntohl(idx); - __builtin_memset(&zero, 0, sizeof(zero)); if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(idx, 0); + return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0); } char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c new file mode 100644 index 000000000000..fc84a7685aa2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdint.h> +#include <stdbool.h> + +#include <linux/bpf.h> +#include <linux/stddef.h> +#include <linux/pkt_cls.h> + +#include <bpf/bpf_helpers.h> + +enum { + dev_src, + dev_dst, +}; + +struct bpf_map_def SEC("maps") ifindex_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2, +}; + +static __always_inline int get_dev_ifindex(int which) +{ + int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); + + return ifindex ? 
*ifindex : 0; +} + +SEC("chk_egress") int tc_chk(struct __sk_buff *skb) +{ + return TC_ACT_SHOT; +} + +SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) +{ + return bpf_redirect_peer(get_dev_ifindex(dev_src), 0); +} + +SEC("src_ingress") int tc_src(struct __sk_buff *skb) +{ + return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0); +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c index 9197a23df3da..678bd0fad29e 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c @@ -21,6 +21,7 @@ __u8 test_kind = TCPOPT_EXP; __u16 test_magic = 0xeB9F; +__u32 inherit_cb_flags = 0; struct bpf_test_option passive_synack_out = {}; struct bpf_test_option passive_fin_out = {}; @@ -467,6 +468,8 @@ static int handle_passive_estab(struct bpf_sock_ops *skops) struct tcphdr *th; int err; + inherit_cb_flags = skops->bpf_sock_ops_cb_flags; + err = load_option(skops, &passive_estab_in, true); if (err == -ENOENT) { /* saved_syn is not found. It was in syncookie mode. @@ -600,10 +603,10 @@ int estab(struct bpf_sock_ops *skops) case BPF_SOCK_OPS_TCP_LISTEN_CB: bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, &true_val, sizeof(true_val)); - set_hdr_cb_flags(skops); + set_hdr_cb_flags(skops, BPF_SOCK_OPS_STATE_CB_FLAG); break; case BPF_SOCK_OPS_TCP_CONNECT_CB: - set_hdr_cb_flags(skops); + set_hdr_cb_flags(skops, 0); break; case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: return handle_parse_hdr(skops); diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 9b6fb00dc7a0..0fa1e421c3d7 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -86,6 +86,7 @@ int txmsg_ktls_skb_redir; int ktls; int peek_flag; int skb_use_parser; +int txmsg_omit_skb_parser; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -111,6 +112,7 @@ static const struct option long_options[] = { {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, + {"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1}, {"whitelist", required_argument, NULL, 'n' }, {"blacklist", required_argument, NULL, 'b' }, {0, 0, NULL, 0 } @@ -175,6 +177,7 @@ static void test_reset(void) txmsg_apply = txmsg_cork = 0; txmsg_ingress = txmsg_redir_skb = 0; txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; + txmsg_omit_skb_parser = 0; skb_use_parser = 0; } @@ -518,28 +521,13 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz) if (i == 0 && txmsg_ktls_skb) { if (msg->msg_iov[i].iov_len < 4) return -EIO; - if (txmsg_ktls_skb_redir) { - if (memcmp(&d[13], "PASS", 4) != 0) { - fprintf(stderr, - "detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]); - return -EIO; - } - d[13] = 0; - d[14] = 1; - d[15] = 2; - d[16] = 3; - j = 13; - } else if (txmsg_ktls_skb) { - if (memcmp(d, "PASS", 4) != 0) { - fprintf(stderr, - "detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]); - return -EIO; - } - d[0] = 0; - d[1] = 1; - d[2] = 2; - d[3] = 3; + if (memcmp(d, "PASS", 4) != 0) { + fprintf(stderr, + "detected skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != 
\"PASS\"\n", + i, 0, d[0], d[1], d[2], d[3]); + return -EIO; } + j = 4; /* advance index past PASS header */ } for (; j < msg->msg_iov[i].iov_len && size; j++) { @@ -927,13 +915,15 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) goto run; /* Attach programs to sockmap */ - err = bpf_prog_attach(prog_fd[0], map_fd[0], - BPF_SK_SKB_STREAM_PARSER, 0); - if (err) { - fprintf(stderr, - "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", - prog_fd[0], map_fd[0], err, strerror(errno)); - return err; + if (!txmsg_omit_skb_parser) { + err = bpf_prog_attach(prog_fd[0], map_fd[0], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[0], err, strerror(errno)); + return err; + } } err = bpf_prog_attach(prog_fd[1], map_fd[0], @@ -946,13 +936,15 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) /* Attach programs to TLS sockmap */ if (txmsg_ktls_skb) { - err = bpf_prog_attach(prog_fd[0], map_fd[8], - BPF_SK_SKB_STREAM_PARSER, 0); - if (err) { - fprintf(stderr, - "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", - prog_fd[0], map_fd[8], err, strerror(errno)); - return err; + if (!txmsg_omit_skb_parser) { + err = bpf_prog_attach(prog_fd[0], map_fd[8], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[8], err, strerror(errno)); + return err; + } } err = bpf_prog_attach(prog_fd[2], map_fd[8], @@ -1480,12 +1472,29 @@ static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) txmsg_ktls_skb_drop = 0; txmsg_ktls_skb_redir = 1; test_exec(cgrp, opt); + txmsg_ktls_skb_redir = 0; + + /* Tests that omit skb_parser */ + txmsg_omit_skb_parser = 1; + ktls = 0; + txmsg_ktls_skb = 0; + test_exec(cgrp, opt); + + txmsg_ktls_skb_drop = 1; + test_exec(cgrp, opt); + txmsg_ktls_skb_drop = 0; + + txmsg_ktls_skb_redir = 1; + test_exec(cgrp, opt); + + ktls = 1; + test_exec(cgrp, opt); + txmsg_omit_skb_parser = 0; opt->data_test = data; ktls = k; } - /* Test cork with hung data. This tests poor usage patterns where * cork can leave data on the ring if user program is buggy and * doesn't flush them somehow. They do take some time however diff --git a/tools/testing/selftests/bpf/test_tc_neigh.sh b/tools/testing/selftests/bpf/test_tc_neigh.sh deleted file mode 100755 index 31d8c3df8b24..000000000000 --- a/tools/testing/selftests/bpf/test_tc_neigh.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link -# between src and dst. The netns fwd has veth links to each src and dst. The -# client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into bpf_redirect_peer() to perform the -# neigh addr population and redirect; it also installs a dropper prog on the -# egress side to drop skbs if neigh addrs were not populated. 
- -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -# check that nc, dd, ping, ping6 and timeout are present -command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v dd >/dev/null 2>&1 || \ - { echo >&2 "dd is not available"; exit 1; } -command -v timeout >/dev/null 2>&1 || \ - { echo >&2 "timeout is not available"; exit 1; } -command -v ping >/dev/null 2>&1 || \ - { echo >&2 "ping is not available"; exit 1; } -command -v ping6 >/dev/null 2>&1 || \ - { echo >&2 "ping6 is not available"; exit 1; } - -readonly GREEN='\033[0;92m' -readonly RED='\033[0;31m' -readonly NC='\033[0m' # No Color - -readonly PING_ARG="-c 3 -w 10 -q" - -readonly TIMEOUT=10 - -readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" -readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" -readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" - -readonly IP4_SRC="172.16.1.100" -readonly IP4_DST="172.16.2.100" - -readonly IP6_SRC="::1:dead:beef:cafe" -readonly IP6_DST="::2:dead:beef:cafe" - -readonly IP4_SLL="169.254.0.1" -readonly IP4_DLL="169.254.0.2" -readonly IP4_NET="169.254.0.0" - -cleanup() -{ - ip netns del ${NS_SRC} - ip netns del ${NS_FWD} - ip netns del ${NS_DST} -} - -trap cleanup EXIT - -set -e - -ip netns add "${NS_SRC}" -ip netns add "${NS_FWD}" -ip netns add "${NS_DST}" - -ip link add veth_src type veth peer name veth_src_fwd -ip link add veth_dst type veth peer name veth_dst_fwd - -ip link set veth_src netns ${NS_SRC} -ip link set veth_src_fwd netns ${NS_FWD} - -ip link set veth_dst netns ${NS_DST} -ip link set veth_dst_fwd netns ${NS_FWD} - -ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src -ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst - -# The fwd netns automatically get a v6 LL address / routes, but also needs v4 -# one in order to start ARP probing. IP4_NET route is added to the endpoints -# so that the ARP processing will reply. 
- -ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd -ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd - -ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad -ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad - -ip -netns ${NS_SRC} link set dev veth_src up -ip -netns ${NS_FWD} link set dev veth_src_fwd up - -ip -netns ${NS_DST} link set dev veth_dst up -ip -netns ${NS_FWD} link set dev veth_dst_fwd up - -ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global -ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global -ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global - -ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global -ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global - -ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global -ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global -ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global - -ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global -ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global - -fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) -fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) - -ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src -ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst - -ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src -ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst - -veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}') -veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}') - -xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o -xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o - -ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact -ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress -ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh - -ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact -ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress -ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh - -rm -f test_tc_neigh.x.o test_tc_neigh.y.o - -ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" -ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" - -set +e - -TEST="TCPv4 connectivity test" -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="TCPv6 connectivity test" -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="ICMPv4 connectivity test" -ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} -if [ $? 
-ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="ICMPv6 connectivity test" -ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh new file mode 100755 index 000000000000..6d7482562140 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tc_redirect.sh @@ -0,0 +1,204 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link +# between src and dst. The netns fwd has veth links to each src and dst. The +# client is in src and server in dst. The test installs a TC BPF program to each +# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the +# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace +# switch from ingress side; it also installs a checker prog on the egress side +# to drop unexpected traffic. + +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root" + echo "FAIL" + exit 1 +fi + +# check that needed tools are present +command -v nc >/dev/null 2>&1 || \ + { echo >&2 "nc is not available"; exit 1; } +command -v dd >/dev/null 2>&1 || \ + { echo >&2 "dd is not available"; exit 1; } +command -v timeout >/dev/null 2>&1 || \ + { echo >&2 "timeout is not available"; exit 1; } +command -v ping >/dev/null 2>&1 || \ + { echo >&2 "ping is not available"; exit 1; } +command -v ping6 >/dev/null 2>&1 || \ + { echo >&2 "ping6 is not available"; exit 1; } +command -v perl >/dev/null 2>&1 || \ + { echo >&2 "perl is not available"; exit 1; } +command -v jq >/dev/null 2>&1 || \ + { echo >&2 "jq is not available"; exit 1; } +command -v bpftool >/dev/null 2>&1 || \ + { echo >&2 "bpftool is not available"; exit 1; } + +readonly GREEN='\033[0;92m' +readonly RED='\033[0;31m' +readonly NC='\033[0m' # No Color + +readonly PING_ARG="-c 3 -w 10 -q" + +readonly TIMEOUT=10 + +readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" +readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" +readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" + +readonly IP4_SRC="172.16.1.100" +readonly IP4_DST="172.16.2.100" + +readonly IP6_SRC="::1:dead:beef:cafe" +readonly IP6_DST="::2:dead:beef:cafe" + +readonly IP4_SLL="169.254.0.1" +readonly IP4_DLL="169.254.0.2" +readonly IP4_NET="169.254.0.0" + +netns_cleanup() +{ + ip netns del ${NS_SRC} + ip netns del ${NS_FWD} + ip netns del ${NS_DST} +} + +netns_setup() +{ + ip netns add "${NS_SRC}" + ip netns add "${NS_FWD}" + ip netns add "${NS_DST}" + + ip link add veth_src type veth peer name veth_src_fwd + ip link add veth_dst type veth peer name veth_dst_fwd + + ip link set veth_src netns ${NS_SRC} + ip link set veth_src_fwd netns ${NS_FWD} + + ip link set veth_dst netns ${NS_DST} + ip link set veth_dst_fwd netns ${NS_FWD} + + ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src + ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst + + # The fwd netns automatically get a v6 LL address / routes, but also + # needs v4 one in order to start ARP probing. IP4_NET route is added + # to the endpoints so that the ARP processing will reply. 
+ + ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd + ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd + + ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad + ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad + + ip -netns ${NS_SRC} link set dev veth_src up + ip -netns ${NS_FWD} link set dev veth_src_fwd up + + ip -netns ${NS_DST} link set dev veth_dst up + ip -netns ${NS_FWD} link set dev veth_dst_fwd up + + ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global + ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global + ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global + + ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global + ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global + + ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global + ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global + ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global + + ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global + ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global + + fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) + fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) + + ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src + ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst + + ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src + ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst +} + +netns_test_connectivity() +{ + set +e + + ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" + ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" + + TEST="TCPv4 connectivity test" + ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="TCPv6 connectivity test" + ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="ICMPv4 connectivity test" + ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} + if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + TEST="ICMPv6 connectivity test" + ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} + if [ $? 
-ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 + fi + echo -e "${TEST}: ${GREEN}PASS${NC}" + + set -e +} + +hex_mem_str() +{ + perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 +} + +netns_setup_bpf() +{ + local obj=$1 + + ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress + + ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress + + veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) + veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) + + progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]') + for prog in $progs; do + map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]') + if [ ! -z "$map" ]; then + bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src) + bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst) + fi + done +} + +trap netns_cleanup EXIT +set -e + +netns_setup +netns_setup_bpf test_tc_neigh.o +netns_test_connectivity +netns_cleanup +netns_setup +netns_setup_bpf test_tc_peer.o +netns_test_connectivity diff --git a/tools/testing/selftests/bpf/test_tcp_hdr_options.h b/tools/testing/selftests/bpf/test_tcp_hdr_options.h index 78a8cf9eab42..6118e3ab61fc 100644 --- a/tools/testing/selftests/bpf/test_tcp_hdr_options.h +++ b/tools/testing/selftests/bpf/test_tcp_hdr_options.h @@ -110,12 +110,13 @@ static inline void clear_hdr_cb_flags(struct bpf_sock_ops *skops) BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)); } -static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops) +static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops, __u32 extra) { bpf_sock_ops_cb_flags_set(skops, skops->bpf_sock_ops_cb_flags | BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | - BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG | + extra); } static inline void clear_parse_all_hdr_cb_flags(struct bpf_sock_ops *skops) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 4d0e913bbb22..1bbd1d9830c8 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -90,6 +90,33 @@ long ksym_get_addr(const char *name) return 0; } +/* open kallsyms and read symbol addresses on the fly. Without caching all symbols, + * this is faster than load + find. 
+ */ +int kallsyms_find(const char *sym, unsigned long long *addr) +{ + char type, name[500]; + unsigned long long value; + int err = 0; + FILE *f; + + f = fopen("/proc/kallsyms", "r"); + if (!f) + return -EINVAL; + + while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) { + if (strcmp(name, sym) == 0) { + *addr = value; + goto out; + } + } + err = -ENOENT; + +out: + fclose(f); + return err; +} + void read_trace_pipe(void) { int trace_fd; diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 25ef597dd03f..f62fdef9e589 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -12,6 +12,10 @@ struct ksym { int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); + +/* open kallsyms and find addresses on the fly, faster than load + search. */ +int kallsyms_find(const char *sym, unsigned long long *addr); + void read_trace_pipe(void); #endif diff --git a/tools/testing/selftests/bpf/verifier/basic.c b/tools/testing/selftests/bpf/verifier/basic.c index b8d18642653a..de84f0d57082 100644 --- a/tools/testing/selftests/bpf/verifier/basic.c +++ b/tools/testing/selftests/bpf/verifier/basic.c @@ -2,7 +2,7 @@ "empty prog", .insns = { }, - .errstr = "unknown opcode 00", + .errstr = "last insn is not an exit or jmp", .result = REJECT, }, { diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c index 2c5fbe7bcd27..ae72536603fe 100644 --- a/tools/testing/selftests/bpf/verifier/direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c @@ -529,7 +529,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "invalid access to packet, off=0 size=8, R5(id=1,off=0,r=0)", + .errstr = "invalid access to packet, off=0 size=8, R5(id=2,off=0,r=0)", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c index 3856dba733e9..f9297900cea6 100644 --- a/tools/testing/selftests/bpf/verifier/ld_imm64.c +++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c @@ -51,14 +51,6 @@ .result = REJECT, }, { - "test5 ld_imm64", - .insns = { - BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), - }, - .errstr = "invalid bpf_ld_imm64 insn", - .result = REJECT, -}, -{ "test6 ld_imm64", .insns = { BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), diff --git a/tools/testing/selftests/bpf/verifier/regalloc.c b/tools/testing/selftests/bpf/verifier/regalloc.c new file mode 100644 index 000000000000..ac71b824f97a --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/regalloc.c @@ -0,0 +1,243 @@ +{ + "regalloc basic", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 4), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 4 }, + .result = ACCEPT, + .prog_type = 
BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"regalloc negative",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 24, 4),
+	BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+	BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_48b = { 4 },
+	.result = REJECT,
+	.errstr = "invalid access to map value, value_size=48 off=48 size=1",
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"regalloc src_reg mark",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 5),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_48b = { 4 },
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"regalloc src_reg negative",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 22, 5),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_48b = { 4 },
+	.result = REJECT,
+	.errstr = "invalid access to map value, value_size=48 off=44 size=8",
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"regalloc and spill",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 7),
+	/* r0 has upper bound that should propagate into r2 */
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */
+	BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */
+	BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2),
+	/* r3 has lower and upper bounds */
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_48b = { 4 },
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"regalloc and spill negative",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 48, 7),
+	/* r0 has upper bound that should propagate into r2 */
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */
+	BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */
+	BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2),
+	/* r3 has lower and upper bounds */
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_48b = { 4 },
+	.result = REJECT,
+	.errstr = "invalid access to map value, value_size=48 off=48 size=8",
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"regalloc three regs",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+	BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 12, 5),
+	BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_4),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_48b = { 4 },
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"regalloc after call",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+	BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+	BPF_JMP_IMM(BPF_JSGT, BPF_REG_8, 20, 4),
+	BPF_JMP_IMM(BPF_JSLT, BPF_REG_9, 0, 3),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_8),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_9),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_48b = { 4 },
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"regalloc in callee",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 20, 5),
+	BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_48b = { 4 },
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},