Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Alexei Starovoitov says: ==================== 1) libbpf should not attempt to load unused subprogs, from Andrii. 2) Make strncpy_from_user() mask out bytes after NUL terminator, from Daniel. 3) Relax return code check for subprograms in the BPF verifier, from Dmitrii. 4) Fix several sockmap issues, from John. * https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf: fail_function: Remove a redundant mutex unlock selftest/bpf: Test bpf_probe_read_user_str() strips trailing bytes after NUL lib/strncpy_from_user.c: Mask out bytes after NUL terminator. libbpf: Fix VERSIONED_SYM_COUNT number parsing bpf, sockmap: Avoid failures from skb_to_sgvec when skb has frag_list bpf, sockmap: Handle memory acct if skb_verdict prog redirects to self bpf, sockmap: Avoid returning unneeded EAGAIN when redirecting to self bpf, sockmap: Use truesize with sk_rmem_schedule() bpf, sockmap: Ensure SO_RCVBUF memory is observed on ingress redirect bpf, sockmap: Fix partial copy_page_to_iter so progress can still be made selftests/bpf: Fix error return code in run_getsockopt_test() bpf: Relax return code check for subprograms tools, bpftool: Add missing close before bpftool net attach exit MAINTAINERS/bpf: Update Andrii's entry. selftests/bpf: Fix unused attribute usage in subprogs_unused test bpf: Fix unsigned 'datasec_id' compared with zero in check_pseudo_btf_id bpf: Fix passing zero to PTR_ERR() in bpf_btf_printf_prepare libbpf: Don't attempt to load unused subprog as an entry-point BPF program ==================== Link: https://lore.kernel.org/r/20201119200721.288-1-alexei.starovoitov@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
author: Jakub Kicinski <kuba@kernel.org> 2020-11-19 21:26:09 +0100
committer: Jakub Kicinski <kuba@kernel.org> 2020-11-19 21:26:10 +0100
commit: e6ea60bac1ee28bb46232f8c2ecd3a3fbb9011e0 (patch)
tree: f119a3b701852f184b6131405470de0f3969c783 /net
parent: Merge branch 'net-smc-fixes-2020-11-18' (diff)
parent: fail_function: Remove a redundant mutex unlock (diff)
download: linux-e6ea60bac1ee28bb46232f8c2ecd3a3fbb9011e0.tar.xz
linux-e6ea60bac1ee28bb46232f8c2ecd3a3fbb9011e0.zip
2 files changed, 85 insertions, 20 deletions
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 654182ecf87b..25cdbb20f3a0 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -170,10 +170,12 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
 	struct scatterlist *sge = sk_msg_elem(msg, i);
 	u32 len = sge->length;
 
-	if (charge)
-		sk_mem_uncharge(sk, len);
-	if (!msg->skb)
+	/* When the skb owns the memory we free it from consume_skb path. */
+	if (!msg->skb) {
+		if (charge)
+			sk_mem_uncharge(sk, len);
 		put_page(sg_page(sge));
+	}
 	memset(sge, 0, sizeof(*sge));
 	return len;
 }
@@ -397,28 +399,45 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
 
-static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+						  struct sk_buff *skb)
 {
-	struct sock *sk = psock->sk;
-	int copied = 0, num_sge;
 	struct sk_msg *msg;
 
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+		return NULL;
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		return NULL;
+
 	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
 	if (unlikely(!msg))
-		return -EAGAIN;
-	if (!sk_rmem_schedule(sk, skb, skb->len)) {
-		kfree(msg);
-		return -EAGAIN;
-	}
+		return NULL;
 
 	sk_msg_init(msg);
+	return msg;
+}
+
+static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
+					struct sk_psock *psock,
+					struct sock *sk,
+					struct sk_msg *msg)
+{
+	int num_sge, copied;
+
+	/* skb linearize may fail with ENOMEM, but lets simply try again
+	 * later if this happens. Under memory pressure we don't want to
+	 * drop the skb. We need to linearize the skb so that the mapping
+	 * in skb_to_sgvec can not error.
+	 */
+	if (skb_linearize(skb))
+		return -EAGAIN;
 	num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
 	if (unlikely(num_sge < 0)) {
 		kfree(msg);
 		return num_sge;
 	}
 
-	sk_mem_charge(sk, skb->len);
 	copied = skb->len;
 	msg->sg.start = 0;
 	msg->sg.size = copied;
@@ -430,6 +449,48 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 	return copied;
 }
 
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb);
+
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sk = psock->sk;
+	struct sk_msg *msg;
+
+	/* If we are receiving on the same sock skb->sk is already assigned,
+	 * skip memory accounting and owner transition seeing it already set
+	 * correctly.
+	 */
+	if (unlikely(skb->sk == sk))
+		return sk_psock_skb_ingress_self(psock, skb);
+	msg = sk_psock_create_ingress_msg(sk, skb);
+	if (!msg)
+		return -EAGAIN;
+
+	/* This will transition ownership of the data from the socket where
+	 * the BPF program was run initiating the redirect to the socket
+	 * we will eventually receive this data on. The data will be released
+	 * from skb_consume found in __tcp_bpf_recvmsg() after its been copied
+	 * into user buffers.
+	 */
+	skb_set_owner_r(skb, sk);
+	return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
+/* Puts an skb on the ingress queue of the socket already assigned to the
+ * skb. In this case we do not need to check memory limits or skb_set_owner_r
+ * because the skb is already accounted for here.
+ */
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb)
+{
+	struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+	struct sock *sk = psock->sk;
+
+	if (unlikely(!msg))
+		return -EAGAIN;
+	sk_msg_init(msg);
+	return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
@@ -789,7 +850,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 		 * retrying later from workqueue.
 		 */
 		if (skb_queue_empty(&psock->ingress_skb)) {
-			err = sk_psock_skb_ingress(psock, skb);
+			err = sk_psock_skb_ingress_self(psock, skb);
 		}
 		if (err < 0) {
 			skb_queue_tail(&psock->ingress_skb, skb);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 37f4cb2bba5c..bc7d2a586e18 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -15,8 +15,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 {
 	struct iov_iter *iter = &msg->msg_iter;
 	int peek = flags & MSG_PEEK;
-	int i, ret, copied = 0;
 	struct sk_msg *msg_rx;
+	int i, copied = 0;
 
 	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
 					  struct sk_msg, list);
@@ -37,17 +37,16 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 			page = sg_page(sge);
 			if (copied + copy > len)
 				copy = len - copied;
-			ret = copy_page_to_iter(page, sge->offset, copy, iter);
-			if (ret != copy) {
-				msg_rx->sg.start = i;
-				return -EFAULT;
-			}
+			copy = copy_page_to_iter(page, sge->offset, copy, iter);
+			if (!copy)
+				return copied ? copied : -EFAULT;
 
 			copied += copy;
 			if (likely(!peek)) {
 				sge->offset += copy;
 				sge->length -= copy;
-				sk_mem_uncharge(sk, copy);
+				if (!msg_rx->skb)
+					sk_mem_uncharge(sk, copy);
 				msg_rx->sg.size -= copy;
 
 				if (!sge->length) {
@@ -56,6 +55,11 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 						put_page(page);
 				}
 			} else {
+				/* Lets not optimize peek case if copy_page_to_iter
+				 * didn't copy the entire length lets just break.
+				 */
+				if (copy != sge->length)
+					return copied;
 				sk_msg_iter_var_next(i);
 			}
author	Jakub Kicinski <kuba@kernel.org>	2020-11-19 21:26:09 +0100
committer	Jakub Kicinski <kuba@kernel.org>	2020-11-19 21:26:10 +0100
commit	e6ea60bac1ee28bb46232f8c2ecd3a3fbb9011e0 (patch)
tree	f119a3b701852f184b6131405470de0f3969c783 /net
parent	Merge branch 'net-smc-fixes-2020-11-18' (diff)
parent	fail_function: Remove a redundant mutex unlock (diff)
download	linux-e6ea60bac1ee28bb46232f8c2ecd3a3fbb9011e0.tar.xz linux-e6ea60bac1ee28bb46232f8c2ecd3a3fbb9011e0.zip