diff options
Diffstat (limited to 'net')
-rw-r--r-- | net/core/dev.c | 32 | ||||
-rw-r--r-- | net/core/filter.c | 195 | ||||
-rw-r--r-- | net/core/skmsg.c | 3 | ||||
-rw-r--r-- | net/core/xdp.c | 70 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 18 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 14 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv4/udp.c | 7 | ||||
-rw-r--r-- | net/ipv6/af_inet6.c | 18 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 | ||||
-rw-r--r-- | net/ipv6/udp.c | 7 | ||||
-rw-r--r-- | net/socket.c | 3 | ||||
-rw-r--r-- | net/xdp/xsk.c | 47 | ||||
-rw-r--r-- | net/xdp/xsk_buff_pool.c | 12 |
14 files changed, 340 insertions, 88 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index ea9b46318d23..6c5967e80132 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2217,28 +2217,14 @@ static inline void net_timestamp_set(struct sk_buff *skb) bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { - unsigned int len; - - if (!(dev->flags & IFF_UP)) - return false; - - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len <= len) - return true; - - /* if TSO is enabled, we don't care about the length as the packet - * could be forwarded without being segmented before - */ - if (skb_is_gso(skb)) - return true; - - return false; + return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); -int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); @@ -2247,6 +2233,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return ret; } + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); +} EXPORT_SYMBOL_GPL(__dev_forward_skb); /** @@ -2273,6 +2264,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_forward_skb); +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) diff --git a/net/core/filter.c b/net/core/filter.c index 3b728ab79a61..adfdad234674 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2083,13 +2083,13 @@ static const struct bpf_func_proto bpf_csum_level_proto = { static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { - return dev_forward_skb(dev, skb); + return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; @@ -2480,7 +2480,7 @@ int skb_do_redirect(struct sk_buff *skb) goto out_drop; dev = ops->ndo_get_peer_dev(dev); if (unlikely(!dev || - !is_skb_forwardable(dev, skb) || + !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; @@ -3552,11 +3552,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, return 0; } -static u32 __bpf_skb_max_len(const struct sk_buff *skb) -{ - return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : - SKB_MAX_ALLOC; -} +#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) @@ -3605,7 +3601,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); - u32 len_max = __bpf_skb_max_len(skb); + u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; @@ -3688,7 +3684,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { - u32 max_len = __bpf_skb_max_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; @@ -3764,7 +3760,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = { static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { - u32 max_len = __bpf_skb_max_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; @@ -4631,6 +4627,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); @@ -5291,12 +5299,14 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, const struct neighbour *neigh, - const struct net_device *dev) + const struct net_device *dev, u32 mtu) { memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + if (mtu) + params->mtu_result = mtu; /* union with tot_len */ return 0; } @@ -5312,8 +5322,8 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct net_device *dev; struct fib_result res; struct flowi4 fl4; + u32 mtu = 0; int err; - u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -5380,8 +5390,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } nhc = res.nhc; @@ -5415,7 +5427,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -5432,7 +5444,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif, err; - u32 mtu; + u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) @@ -5507,8 +5519,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } if (res.nh->fib_nh_lws) @@ -5528,7 +5542,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -5571,6 +5585,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; + bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; @@ -5578,25 +5593,33 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) return -EINVAL; + if (params->tot_len) + check_mtu = true; + switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - rc = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - rc = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } - if (!rc) { + if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; + /* When tot_len isn't provided by user, check skb + * against MTU of FIB lookup resulting net_device + */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; + + params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; @@ -5612,6 +5635,116 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, + u32 ifindex) +{ + struct net *netns = dev_net(dev_curr); + + /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ + if (ifindex == 0) + return dev_curr; + + return dev_get_by_index_rcu(netns, ifindex); +} + +BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + struct net_device *dev = skb->dev; + int skb_len, dev_len; + int mtu; + + if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) + return -EINVAL; + + if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + dev_len = mtu + dev->hard_header_len; + skb_len = skb->len + len_diff; /* minus result pass check */ + if (skb_len <= dev_len) { + ret = BPF_MTU_CHK_RET_SUCCESS; + goto out; + } + /* At this point, skb->len exceed MTU, but as it include length of all + * segments, it can still be below MTU. The SKB can possibly get + * re-segmented in transmit path (see validate_xmit_skb). Thus, user + * must choose if segs are to be MTU checked. + */ + if (skb_is_gso(skb)) { + ret = BPF_MTU_CHK_RET_SUCCESS; + + if (flags & BPF_MTU_CHK_SEGS && + !skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } +out: + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + struct net_device *dev = xdp->rxq->dev; + int xdp_len = xdp->data_end - xdp->data; + int ret = BPF_MTU_CHK_RET_SUCCESS; + int mtu, dev_len; + + /* XDP variant doesn't support multi-buffer segment check (yet) */ + if (unlikely(flags)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + /* Add L2-header as dev MTU is L3 size */ + dev_len = mtu + dev->hard_header_len; + + xdp_len += len_diff; /* minus result pass check */ + if (xdp_len > dev_len) + ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +static const struct bpf_func_proto bpf_skb_check_mtu_proto = { + .func = bpf_skb_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { + .func = bpf_xdp_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { @@ -7021,6 +7154,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; @@ -7031,6 +7172,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; @@ -7181,6 +7330,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -7250,6 +7401,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 25cdbb20f3a0..1261512d6807 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -669,14 +669,13 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -void sk_psock_destroy(struct rcu_head *rcu) +static void sk_psock_destroy(struct rcu_head *rcu) { struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); INIT_WORK(&psock->gc, sk_psock_destroy_deferred); schedule_work(&psock->gc); } -EXPORT_SYMBOL_GPL(sk_psock_destroy); void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 3a8c9ab4ecbe..05354976c1fc 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -513,3 +513,73 @@ void xdp_warn(const char *msg, const char *func, const int line) WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); }; EXPORT_SYMBOL_GPL(xdp_warn); + +int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) +{ + n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, + n_skb, skbs); + if (unlikely(!n_skb)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); + +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev) +{ + unsigned int headroom, frame_size; + void *hard_start; + + /* Part of headroom was reserved to xdpf */ + headroom = sizeof(*xdpf) + xdpf->headroom; + + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. + */ + frame_size = xdpf->frame_sz; + + hard_start = xdpf->data - headroom; + skb = build_skb_around(skb, hard_start, frame_size); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, dev); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + + return skb; +} +EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); + +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev) +{ + struct sk_buff *skb; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + + return __xdp_build_skb_from_frame(xdpf, skb, dev); +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2ff5d8058ce6..a02ce89b56b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -438,6 +438,7 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -450,11 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET4_BIND, &flags); if (err) return err; - return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet_bind); @@ -499,7 +501,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; @@ -777,18 +780,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET4_GETPEERNAME, + NULL); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET4_GETPEERNAME : - BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET4_GETSOCKNAME, NULL); + } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7a6b58ae408d..a3422e42784e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4162,6 +4162,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EINVAL; lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; @@ -4208,6 +4210,18 @@ zerocopy_rcv_out: return 0; } +bool tcp_bpf_bypass_getsockopt(int level, int optname) +{ + /* TCP do_tcp_getsockopt has optimized getsockopt implementation + * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. + */ + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) + return true; + + return false; +} +EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); + int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 611039207d30..daad4f99db32 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2796,6 +2796,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 48208fb4e895..4a0478b17243 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1130,7 +1130,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1864,9 +1864,8 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0e9994e0ecd7..1fb75f01756c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -295,7 +295,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -439,6 +440,7 @@ out_unlock: int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err = 0; /* If the socket has its own bind function then use it. */ @@ -451,11 +453,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET6_BIND, &flags); if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet6_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet6_bind); @@ -527,18 +530,19 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET6_GETPEERNAME, + NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET6_GETPEERNAME : - BPF_CGROUP_INET6_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, NULL); + } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d093ef3ef060..bd44ded7e50c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2124,6 +2124,7 @@ struct proto tcpv6_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d75429205084..d25e5a9252fd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -409,9 +409,8 @@ try_again: } *addr_len = sizeof(*sin6); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) @@ -1462,7 +1461,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/net/socket.c b/net/socket.c index 33e8b6c4e1d3..7f0617ab5437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return __sys_setsockopt(fd, level, optname, optval, optlen); } +INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, + int optname)); + /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4a83117507f5..4faabd1ecfd1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -184,12 +184,13 @@ static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, - bool explicit_free) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { struct xdp_buff *xsk_xdp; int err; + u32 len; + len = xdp->data_end - xdp->data; if (len > xsk_pool_get_rx_frame_size(xs->pool)) { xs->rx_dropped++; return -ENOSPC; @@ -207,8 +208,6 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, xsk_buff_free(xsk_xdp); return err; } - if (explicit_free) - xdp_return_buff(xdp); return 0; } @@ -230,11 +229,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, - bool explicit_free) +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 len; - if (!xsk_is_bound(xs)) return -EINVAL; @@ -242,11 +238,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, return -EINVAL; sk_mark_napi_id_once_xdp(&xs->sk, xdp); - len = xdp->data_end - xdp->data; - - return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? - __xsk_rcv_zc(xs, xdp, len) : - __xsk_rcv(xs, xdp, len, explicit_free); + return 0; } static void xsk_flush(struct xdp_sock *xs) @@ -261,18 +253,41 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) int err; spin_lock_bh(&xs->rx_lock); - err = xsk_rcv(xs, xdp, false); - xsk_flush(xs); + err = xsk_rcv_check(xs, xdp); + if (!err) { + err = __xsk_rcv(xs, xdp); + xsk_flush(xs); + } spin_unlock_bh(&xs->rx_lock); return err; } +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + u32 len; + + err = xsk_rcv_check(xs, xdp); + if (err) + return err; + + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { + len = xdp->data_end - xdp->data; + return __xsk_rcv_zc(xs, xdp, len); + } + + err = __xsk_rcv(xs, xdp); + if (!err) + xdp_return_buff(xdp); + return err; +} + int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp, true); + err = xsk_rcv(xs, xdp); if (err) return err; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 20598eea658c..8de01aaac4a0 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -119,8 +119,8 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) } } -static int __xp_assign_dev(struct xsk_buff_pool *pool, - struct net_device *netdev, u16 queue_id, u16 flags) +int xp_assign_dev(struct xsk_buff_pool *pool, + struct net_device *netdev, u16 queue_id, u16 flags) { bool force_zc, force_copy; struct netdev_bpf bpf; @@ -191,12 +191,6 @@ err_unreg_pool: return err; } -int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, - u16 queue_id, u16 flags) -{ - return __xp_assign_dev(pool, dev, queue_id, flags); -} - int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, struct net_device *dev, u16 queue_id) { @@ -210,7 +204,7 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, if (pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; - return __xp_assign_dev(pool, dev, queue_id, flags); + return xp_assign_dev(pool, dev, queue_id, flags); } void xp_clear_dev(struct xsk_buff_pool *pool) |