summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- net/ipv4/tcp.c 334
1 files changed, 114 insertions, 220 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8d20d9221238..e03e08745308 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -599,7 +599,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
EXPORT_SYMBOL(tcp_poll);
-int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{
struct tcp_sock *tp = tcp_sk(sk);
int answ;
@@ -641,7 +641,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -ENOIOCTLCMD;
}
- return put_user(answ, (int __user *)arg);
+ *karg = answ;
+ return 0;
}
EXPORT_SYMBOL(tcp_ioctl);
@@ -838,7 +839,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
tss.len -= ret;
spliced += ret;
- if (!timeo)
+ if (!tss.len || !timeo)
break;
release_sock(sk);
lock_sock(sk);
@@ -858,12 +859,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_SYMBOL(tcp_splice_read);
-struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
bool force_schedule)
{
struct sk_buff *skb;
- skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
if (likely(skb)) {
bool mem_scheduled;
@@ -922,11 +923,10 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
-/* In some cases, both sendpage() and sendmsg() could have added
- * an skb to the write queue, but failed adding payload on it.
- * We need to remove it to consume less memory, but more
- * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
- * users.
+/* In some cases, sendmsg() could have added an skb to the write queue,
+ * but failed to add payload to it. We need to remove it to consume less
+ * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
+ * epoll() users.
*/
void tcp_remove_empty_skb(struct sock *sk)
{
@@ -957,7 +957,7 @@ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
}
-static int tcp_wmem_schedule(struct sock *sk, int copy)
+int tcp_wmem_schedule(struct sock *sk, int copy)
{
int left;
@@ -974,191 +974,6 @@ static int tcp_wmem_schedule(struct sock *sk, int copy)
return min(copy, sk->sk_forward_alloc);
}
-static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
- struct page *page, int offset, size_t *size)
-{
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- bool can_coalesce;
- int copy, i;
-
- if (!skb || (copy = size_goal - skb->len) <= 0 ||
- !tcp_skb_can_collapse_to(skb)) {
-new_segment:
- if (!sk_stream_memory_free(sk))
- return NULL;
-
- skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
- tcp_rtx_and_write_queues_empty(sk));
- if (!skb)
- return NULL;
-
-#ifdef CONFIG_TLS_DEVICE
- skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
-#endif
- tcp_skb_entail(sk, skb);
- copy = size_goal;
- }
-
- if (copy > *size)
- copy = *size;
-
- i = skb_shinfo(skb)->nr_frags;
- can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
- tcp_mark_push(tp, skb);
- goto new_segment;
- }
- if (tcp_downgrade_zcopy_pure(sk, skb))
- return NULL;
-
- copy = tcp_wmem_schedule(sk, copy);
- if (!copy)
- return NULL;
-
- if (can_coalesce) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
- } else {
- get_page(page);
- skb_fill_page_desc_noacc(skb, i, page, offset, copy);
- }
-
- if (!(flags & MSG_NO_SHARED_FRAGS))
- skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
-
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
- sk_wmem_queued_add(sk, copy);
- sk_mem_charge(sk, copy);
- WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
- TCP_SKB_CB(skb)->end_seq += copy;
- tcp_skb_pcount_set(skb, 0);
-
- *size = copy;
- return skb;
-}
-
-ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int mss_now, size_goal;
- int err;
- ssize_t copied;
- long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
-
- if (IS_ENABLED(CONFIG_DEBUG_VM) &&
- WARN_ONCE(!sendpage_ok(page),
- "page must not be a Slab one and have page_count > 0"))
- return -EINVAL;
-
- /* Wait for a connection to finish. One exception is TCP Fast Open
- * (passive side) where data is allowed to be sent before a connection
- * is fully established.
- */
- if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
- !tcp_passive_fastopen(sk)) {
- err = sk_stream_wait_connect(sk, &timeo);
- if (err != 0)
- goto out_err;
- }
-
- sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-
- mss_now = tcp_send_mss(sk, &size_goal, flags);
- copied = 0;
-
- err = -EPIPE;
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto out_err;
-
- while (size > 0) {
- struct sk_buff *skb;
- size_t copy = size;
-
- skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
- if (!skb)
- goto wait_for_space;
-
- if (!copied)
- TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
-
- copied += copy;
- offset += copy;
- size -= copy;
- if (!size)
- goto out;
-
- if (skb->len < size_goal || (flags & MSG_OOB))
- continue;
-
- if (forced_push(tp)) {
- tcp_mark_push(tp, skb);
- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
- } else if (skb == tcp_send_head(sk))
- tcp_push_one(sk, mss_now);
- continue;
-
-wait_for_space:
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- tcp_push(sk, flags & ~MSG_MORE, mss_now,
- TCP_NAGLE_PUSH, size_goal);
-
- err = sk_stream_wait_memory(sk, &timeo);
- if (err != 0)
- goto do_error;
-
- mss_now = tcp_send_mss(sk, &size_goal, flags);
- }
-
-out:
- if (copied) {
- tcp_tx_timestamp(sk, sk->sk_tsflags);
- if (!(flags & MSG_SENDPAGE_NOTLAST))
- tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
- }
- return copied;
-
-do_error:
- tcp_remove_empty_skb(sk);
- if (copied)
- goto out;
-out_err:
- /* make sure we wake any epoll edge trigger waiter */
- if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
- sk->sk_write_space(sk);
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
- }
- return sk_stream_error(sk, flags, err);
-}
-EXPORT_SYMBOL_GPL(do_tcp_sendpages);
-
-int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
-{
- if (!(sk->sk_route_caps & NETIF_F_SG))
- return sock_no_sendpage_locked(sk, page, offset, size, flags);
-
- tcp_rate_check_app_limited(sk); /* is sending application-limited? */
-
- return do_tcp_sendpages(sk, page, offset, size, flags);
-}
-EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
-
-int tcp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
-{
- int ret;
-
- lock_sock(sk);
- ret = tcp_sendpage_locked(sk, page, offset, size, flags);
- release_sock(sk);
-
- return ret;
-}
-EXPORT_SYMBOL(tcp_sendpage);
-
void tcp_free_fastopen_req(struct tcp_sock *tp)
{
if (tp->fastopen_req) {
@@ -1223,28 +1038,31 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
int process_backlog = 0;
- bool zc = false;
+ int zc = 0;
long timeo;
flags = msg->msg_flags;
if ((flags & MSG_ZEROCOPY) && size) {
- skb = tcp_write_queue_tail(sk);
-
if (msg->msg_ubuf) {
uarg = msg->msg_ubuf;
- net_zcopy_get(uarg);
- zc = sk->sk_route_caps & NETIF_F_SG;
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_ZEROCOPY;
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ skb = tcp_write_queue_tail(sk);
uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
goto out_err;
}
- zc = sk->sk_route_caps & NETIF_F_SG;
- if (!zc)
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_ZEROCOPY;
+ else
uarg_to_msgzc(uarg)->zerocopy = 0;
}
+ } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_SPLICE_PAGES;
}
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
@@ -1307,7 +1125,7 @@ restart:
goto do_error;
while (msg_data_left(msg)) {
- int copy = 0;
+ ssize_t copy = 0;
skb = tcp_write_queue_tail(sk);
if (skb)
@@ -1326,7 +1144,7 @@ new_segment:
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
- skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
first_skb);
if (!skb)
goto wait_for_space;
@@ -1348,7 +1166,7 @@ new_segment:
if (copy > msg_data_left(msg))
copy = msg_data_left(msg);
- if (!zc) {
+ if (zc == 0) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1393,7 +1211,7 @@ new_segment:
page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
- } else {
+ } else if (zc == MSG_ZEROCOPY) {
/* First append to a fragless skb builds initial
* pure zerocopy skb
*/
@@ -1414,6 +1232,30 @@ new_segment:
if (err < 0)
goto do_error;
copy = err;
+ } else if (zc == MSG_SPLICE_PAGES) {
+ /* Splice in data if we can; copy if we can't. */
+ if (tcp_downgrade_zcopy_pure(sk, skb))
+ goto wait_for_space;
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
+ goto wait_for_space;
+
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
+ sk->sk_allocation);
+ if (err < 0) {
+ if (err == -EMSGSIZE) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ goto do_error;
+ }
+ copy = err;
+
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
+
+ sk_wmem_queued_add(sk, copy);
+ sk_mem_charge(sk, copy);
}
if (!copied)
@@ -1459,7 +1301,9 @@ out:
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
- net_zcopy_put(uarg);
+ /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
+ if (uarg && !msg->msg_ubuf)
+ net_zcopy_put(uarg);
return copied + copied_syn;
do_error:
@@ -1468,7 +1312,9 @@ do_error:
if (copied + copied_syn)
goto out;
out_err:
- net_zcopy_put_abort(uarg, true);
+ /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
+ if (uarg && !msg->msg_ubuf)
+ net_zcopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
@@ -1491,6 +1337,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
EXPORT_SYMBOL(tcp_sendmsg);
+void tcp_splice_eof(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now, size_goal;
+
+ if (!tcp_write_queue_tail(sk))
+ return;
+
+ lock_sock(sk);
+ mss_now = tcp_send_mss(sk, &size_goal, 0);
+ tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
+ release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_splice_eof);
+
/*
* Handle reading urgent data. BSD has very simple semantics for
* this, no blocking and very strange errors 8)
@@ -1877,7 +1739,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb,
}
#ifdef CONFIG_MMU
-static const struct vm_operations_struct tcp_vm_ops = {
+const struct vm_operations_struct tcp_vm_ops = {
};
int tcp_mmap(struct file *file, struct socket *sock,
@@ -2176,6 +2038,34 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
}
}
+static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
+ unsigned long address,
+ bool *mmap_locked)
+{
+ struct vm_area_struct *vma = NULL;
+
+#ifdef CONFIG_PER_VMA_LOCK
+ vma = lock_vma_under_rcu(mm, address);
+#endif
+ if (vma) {
+ if (!vma_is_tcp(vma)) {
+ vma_end_read(vma);
+ return NULL;
+ }
+ *mmap_locked = false;
+ return vma;
+ }
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, address);
+ if (!vma || !vma_is_tcp(vma)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+ *mmap_locked = true;
+ return vma;
+}
+
#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc,
@@ -2193,6 +2083,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
u32 seq = tp->copied_seq;
u32 total_bytes_to_map;
int inq = tcp_inq(sk);
+ bool mmap_locked;
int ret;
zc->copybuf_len = 0;
@@ -2217,13 +2108,10 @@ static int tcp_zerocopy_receive(struct sock *sk,
return 0;
}
- mmap_read_lock(current->mm);
-
- vma = vma_lookup(current->mm, address);
- if (!vma || vma->vm_ops != &tcp_vm_ops) {
- mmap_read_unlock(current->mm);
+ vma = find_tcp_vma(current->mm, address, &mmap_locked);
+ if (!vma)
return -EINVAL;
- }
+
vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
avail_len = min_t(u32, vma_len, inq);
total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
@@ -2297,7 +2185,10 @@ static int tcp_zerocopy_receive(struct sock *sk,
zc, total_bytes_to_map);
}
out:
- mmap_read_unlock(current->mm);
+ if (mmap_locked)
+ mmap_read_unlock(current->mm);
+ else
+ vma_end_read(vma);
/* Try to copy straggler data. */
if (!ret)
copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
@@ -4680,8 +4571,10 @@ int tcp_abort(struct sock *sk, int err)
return 0;
}
- /* Don't race with userspace socket closes such as tcp_close. */
- lock_sock(sk);
+ /* BPF context ensures sock locking. */
+ if (!has_current_bpf_ctx())
+ /* Don't race with userspace socket closes such as tcp_close. */
+ lock_sock(sk);
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
@@ -4705,7 +4598,8 @@ int tcp_abort(struct sock *sk, int err)
bh_unlock_sock(sk);
local_bh_enable();
tcp_write_queue_purge(sk);
- release_sock(sk);
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);