diff options
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 5 | ||||
-rw-r--r-- | kernel/bpf/core.c | 2 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 2610 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 6 |
4 files changed, 3 insertions, 2620 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0488b8258321..ff8262626b8f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y) obj-$(CONFIG_BPF_SYSCALL) += xskmap.o endif obj-$(CONFIG_BPF_SYSCALL) += offload.o -ifeq ($(CONFIG_STREAM_PARSER),y) -ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_BPF_SYSCALL) += sockmap.o -endif -endif endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3f5bf1af0826..defcf4df6d91 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; -const struct bpf_func_proto bpf_sock_map_update_proto __weak; -const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c deleted file mode 100644 index de6f7a65c72b..000000000000 --- a/kernel/bpf/sockmap.c +++ /dev/null @@ -1,2610 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -/* A BPF sock_map is used to store sock objects. This is primarly used - * for doing socket redirect with BPF helper routines. - * - * A sock map may have BPF programs attached to it, currently a program - * used to parse packets and a program to provide a verdict and redirect - * decision on the packet are supported. Any programs attached to a sock - * map are inherited by sock objects when they are added to the map. If - * no BPF programs are attached the sock object may only be used for sock - * redirect. - * - * A sock object may be in multiple maps, but can only inherit a single - * parse or verdict program. If adding a sock object to a map would result - * in having multiple parsing programs the update will return an EBUSY error. - * - * For reference this program is similar to devmap used in XDP context - * reviewing these together may be useful. For an example please review - * ./samples/bpf/sockmap/. - */ -#include <linux/bpf.h> -#include <net/sock.h> -#include <linux/filter.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/kernel.h> -#include <linux/net.h> -#include <linux/skbuff.h> -#include <linux/workqueue.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <net/strparser.h> -#include <net/tcp.h> -#include <linux/ptr_ring.h> -#include <net/inet_common.h> -#include <linux/sched/signal.h> - -#define SOCK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - -struct bpf_sock_progs { - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; -}; - -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; - struct bpf_sock_progs progs; - raw_spinlock_t lock; -}; - -struct bucket { - struct hlist_head head; - raw_spinlock_t lock; -}; - -struct bpf_htab { - struct bpf_map map; - struct bucket *buckets; - atomic_t count; - u32 n_buckets; - u32 elem_size; - struct bpf_sock_progs progs; - struct rcu_head rcu; -}; - -struct htab_elem { - struct rcu_head rcu; - struct hlist_node hash_node; - u32 hash; - struct sock *sk; - char key[0]; -}; - -enum smap_psock_state { - SMAP_TX_RUNNING, -}; - -struct smap_psock_map_entry { - struct list_head list; - struct bpf_map *map; - struct sock **entry; - struct htab_elem __rcu *hash_link; -}; - -struct smap_psock { - struct rcu_head rcu; - refcount_t refcnt; - - /* datapath variables */ - struct sk_buff_head rxqueue; - bool strp_enabled; - - /* datapath error path cache across tx work invocations */ - int save_rem; - int save_off; - struct sk_buff *save_skb; - - /* datapath variables for tx_msg ULP */ - struct sock *sk_redir; - int apply_bytes; - int cork_bytes; - int sg_size; - int eval; - struct sk_msg_buff *cork; - struct list_head ingress; - - struct strparser strp; - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; - struct list_head maps; - spinlock_t maps_lock; - - /* Back reference used when sock callback trigger sockmap operations */ - struct sock *sock; - unsigned long state; - - struct work_struct tx_work; - struct work_struct gc_work; - - struct proto *sk_proto; - void (*save_unhash)(struct sock *sk); - void (*save_close)(struct sock *sk, long timeout); - void (*save_data_ready)(struct sock *sk); - void (*save_write_space)(struct sock *sk); -}; - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -static void bpf_tcp_unhash(struct sock *sk); -static void bpf_tcp_close(struct sock *sk, long timeout); - -static inline struct smap_psock *smap_psock_sk(const struct sock *sk) -{ - return rcu_dereference_sk_user_data(sk); -} - -static bool bpf_tcp_stream_read(const struct sock *sk) -{ - struct smap_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - empty = list_empty(&psock->ingress); -out: - rcu_read_unlock(); - return !empty; -} - -enum { - SOCKMAP_IPV4, - SOCKMAP_IPV6, - SOCKMAP_NUM_PROTS, -}; - -enum { - SOCKMAP_BASE, - SOCKMAP_TX, - SOCKMAP_NUM_CONFIGS, -}; - -static struct proto *saved_tcpv6_prot __read_mostly; -static DEFINE_SPINLOCK(tcpv6_prot_lock); -static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; - -static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], - struct proto *base) -{ - prot[SOCKMAP_BASE] = *base; - prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; - prot[SOCKMAP_BASE].close = bpf_tcp_close; - prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; - prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; - - prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; - prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; - prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; -} - -static void update_sk_prot(struct sock *sk, struct smap_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; - int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; - - sk->sk_prot = &bpf_tcp_prots[family][conf]; -} - -static int bpf_tcp_init(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return -EINVAL; - } - - if (unlikely(psock->sk_proto)) { - rcu_read_unlock(); - return -EBUSY; - } - - psock->save_unhash = sk->sk_prot->unhash; - psock->save_close = sk->sk_prot->close; - psock->sk_proto = sk->sk_prot; - - /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ - if (sk->sk_family == AF_INET6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { - spin_lock_bh(&tcpv6_prot_lock); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); - } - spin_unlock_bh(&tcpv6_prot_lock); - } - update_sk_prot(sk, psock); - rcu_read_unlock(); - return 0; -} - -static int __init bpf_sock_init(void) -{ - build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); - return 0; -} -core_initcall(bpf_sock_init); - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); - -static void bpf_tcp_release(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; - } -out: - rcu_read_unlock(); -} - -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} - -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; -} - -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) -{ - atomic_dec(&htab->count); - kfree_rcu(l, rcu); -} - -static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, - struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - - spin_lock_bh(&psock->maps_lock); - e = list_first_entry_or_null(&psock->maps, - struct smap_psock_map_entry, - list); - if (e) - list_del(&e->list); - spin_unlock_bh(&psock->maps_lock); - return e; -} - -static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - struct sk_msg_buff *md, *mtmp; - struct sock *osk; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - e = psock_map_pop(sk, psock); - while (e) { - if (e->entry) { - struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); - - raw_spin_lock_bh(&stab->lock); - osk = *e->entry; - if (osk == sk) { - *e->entry = NULL; - smap_release_sock(psock, sk); - } - raw_spin_unlock_bh(&stab->lock); - } else { - struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - struct bucket *b; - - b = __select_bucket(htab, link->hash); - head = &b->head; - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, - link->hash, link->key, - htab->map.key_size); - /* If another thread deleted this object skip deletion. - * The refcnt on psock may or may not be zero. - */ - if (l && l == link) { - hlist_del_rcu(&link->hash_node); - smap_release_sock(psock, link->sk); - free_htab_elem(htab, link); - } - raw_spin_unlock_bh(&b->lock); - } - kfree(e); - e = psock_map_pop(sk, psock); - } -} - -static void bpf_tcp_unhash(struct sock *sk) -{ - void (*unhash_fun)(struct sock *sk); - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - unhash_fun = psock->save_unhash; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - unhash_fun(sk); -} - -static void bpf_tcp_close(struct sock *sk, long timeout) -{ - void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - close_fun = psock->save_close; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - close_fun(sk, timeout); -} - -enum __sk_action { - __SK_DROP = 0, - __SK_PASS, - __SK_REDIRECT, - __SK_NONE, -}; - -static int memcopy_from_iter(struct sock *sk, - struct sk_msg_buff *md, - struct iov_iter *from, int bytes) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_curr, rc = -ENOSPC; - - do { - int copy; - char *to; - - if (md->sg_copybreak >= sg[i].length) { - md->sg_copybreak = 0; - - if (++i == MAX_SKB_FRAGS) - i = 0; - - if (i == md->sg_end) - break; - } - - copy = sg[i].length - md->sg_copybreak; - to = sg_virt(&sg[i]) + md->sg_copybreak; - md->sg_copybreak += copy; - - if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) - rc = copy_from_iter_nocache(to, copy, from); - else - rc = copy_from_iter(to, copy, from); - - if (rc != copy) { - rc = -EFAULT; - goto out; - } - - bytes -= copy; - if (!bytes) - break; - - md->sg_copybreak = 0; - if (++i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -out: - md->sg_curr = i; - return rc; -} - -static int bpf_tcp_push(struct sock *sk, int apply_bytes, - struct sk_msg_buff *md, - int flags, bool uncharge) -{ - bool apply = apply_bytes; - struct scatterlist *sg; - int offset, ret = 0; - struct page *p; - size_t size; - - while (1) { - sg = md->sg_data + md->sg_start; - size = (apply && apply_bytes < sg->length) ? - apply_bytes : sg->length; - offset = sg->offset; - - tcp_rate_check_app_limited(sk); - p = sg_page(sg); -retry: - ret = do_tcp_sendpages(sk, p, offset, size, flags); - if (ret != size) { - if (ret > 0) { - if (apply) - apply_bytes -= ret; - - sg->offset += ret; - sg->length -= ret; - size -= ret; - offset += ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - goto retry; - } - - return ret; - } - - if (apply) - apply_bytes -= ret; - sg->offset += ret; - sg->length -= ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - - if (!sg->length) { - put_page(p); - md->sg_start++; - if (md->sg_start == MAX_SKB_FRAGS) - md->sg_start = 0; - sg_init_table(sg, 1); - - if (md->sg_start == md->sg_end) - break; - } - - if (apply && !apply_bytes) - break; - } - return 0; -} - -static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data + md->sg_start; - - if (md->sg_copy[md->sg_start]) { - md->data = md->data_end = 0; - } else { - md->data = sg_virt(sg); - md->data_end = md->data + sg->length; - } -} - -static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start; - - do { - int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; - - sk_mem_uncharge(sk, uncharge); - bytes -= uncharge; - if (!bytes) - break; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -} - -static void free_bytes_sg(struct sock *sk, int bytes, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start, free; - - while (bytes && sg[i].length) { - free = sg[i].length; - if (bytes < free) { - sg[i].length -= bytes; - sg[i].offset += bytes; - if (charge) - sk_mem_uncharge(sk, bytes); - break; - } - - if (charge) - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - bytes -= sg[i].length; - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - md->sg_start = i; -} - -static int free_sg(struct sock *sk, int start, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = start, free = 0; - - while (sg[i].length) { - free += sg[i].length; - if (charge) - sk_mem_uncharge(sk, sg[i].length); - if (!md->skb) - put_page(sg_page(&sg[i])); - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - consume_skb(md->skb); - - return free; -} - -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) -{ - int free = free_sg(sk, md->sg_start, md, charge); - - md->sg_start = md->sg_end; - return free; -} - -static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) -{ - return free_sg(sk, md->sg_curr, md, true); -} - -static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) -{ - return ((_rc == SK_PASS) ? - (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP); -} - -static unsigned int smap_do_tx_msg(struct sock *sk, - struct smap_psock *psock, - struct sk_msg_buff *md) -{ - struct bpf_prog *prog; - unsigned int rc, _rc; - - preempt_disable(); - rcu_read_lock(); - - /* If the policy was removed mid-send then default to 'accept' */ - prog = READ_ONCE(psock->bpf_tx_msg); - if (unlikely(!prog)) { - _rc = SK_PASS; - goto verdict; - } - - bpf_compute_data_pointers_sg(md); - md->sk = sk; - rc = (*prog->bpf_func)(md, prog->insnsi); - psock->apply_bytes = md->apply_bytes; - - /* Moving return codes from UAPI namespace into internal namespace */ - _rc = bpf_map_msg_verdict(rc, md); - - /* The psock has a refcount on the sock but not on the map and because - * we need to drop rcu read lock here its possible the map could be - * removed between here and when we need it to execute the sock - * redirect. So do the map lookup now for future use. - */ - if (_rc == __SK_REDIRECT) { - if (psock->sk_redir) - sock_put(psock->sk_redir); - psock->sk_redir = do_msg_redirect_map(md); - if (!psock->sk_redir) { - _rc = __SK_DROP; - goto verdict; - } - sock_hold(psock->sk_redir); - } -verdict: - rcu_read_unlock(); - preempt_enable(); - - return _rc; -} - -static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, - struct smap_psock *psock, - struct sk_msg_buff *md, int flags) -{ - bool apply = apply_bytes; - size_t size, copied = 0; - struct sk_msg_buff *r; - int err = 0, i; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!r)) - return -ENOMEM; - - lock_sock(sk); - r->sg_start = md->sg_start; - i = md->sg_start; - - do { - size = (apply && apply_bytes < md->sg_data[i].length) ? - apply_bytes : md->sg_data[i].length; - - if (!sk_wmem_schedule(sk, size)) { - if (!copied) - err = -ENOMEM; - break; - } - - sk_mem_charge(sk, size); - r->sg_data[i] = md->sg_data[i]; - r->sg_data[i].length = size; - md->sg_data[i].length -= size; - md->sg_data[i].offset += size; - copied += size; - - if (md->sg_data[i].length) { - get_page(sg_page(&r->sg_data[i])); - r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; - } else { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - r->sg_end = i; - } - - if (apply) { - apply_bytes -= size; - if (!apply_bytes) - break; - } - } while (i != md->sg_end); - - md->sg_start = i; - - if (!err) { - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - } else { - free_start_sg(sk, r, true); - kfree(r); - } - - release_sock(sk); - return err; -} - -static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, - struct sk_msg_buff *md, - int flags) -{ - bool ingress = !!(md->flags & BPF_F_INGRESS); - struct smap_psock *psock; - int err = 0; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out_rcu; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto out_rcu; - - rcu_read_unlock(); - - if (ingress) { - err = bpf_tcp_ingress(sk, send, psock, md, flags); - } else { - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); - } - smap_release_sock(psock, sk); - return err; -out_rcu: - rcu_read_unlock(); - return 0; -} - -static inline void bpf_md_init(struct smap_psock *psock) -{ - if (!psock->apply_bytes) { - psock->eval = __SK_NONE; - if (psock->sk_redir) { - sock_put(psock->sk_redir); - psock->sk_redir = NULL; - } - } -} - -static void apply_bytes_dec(struct smap_psock *psock, int i) -{ - if (psock->apply_bytes) { - if (psock->apply_bytes < i) - psock->apply_bytes = 0; - else - psock->apply_bytes -= i; - } -} - -static int bpf_exec_tx_verdict(struct smap_psock *psock, - struct sk_msg_buff *m, - struct sock *sk, - int *copied, int flags) -{ - bool cork = false, enospc = (m->sg_start == m->sg_end); - struct sock *redir; - int err = 0; - int send; - -more_data: - if (psock->eval == __SK_NONE) - psock->eval = smap_do_tx_msg(sk, psock, m); - - if (m->cork_bytes && - m->cork_bytes > psock->sg_size && !enospc) { - psock->cork_bytes = m->cork_bytes - psock->sg_size; - if (!psock->cork) { - psock->cork = kcalloc(1, - sizeof(struct sk_msg_buff), - GFP_ATOMIC | __GFP_NOWARN); - - if (!psock->cork) { - err = -ENOMEM; - goto out_err; - } - } - memcpy(psock->cork, m, sizeof(*m)); - goto out_err; - } - - send = psock->sg_size; - if (psock->apply_bytes && psock->apply_bytes < send) - send = psock->apply_bytes; - - switch (psock->eval) { - case __SK_PASS: - err = bpf_tcp_push(sk, send, m, flags, true); - if (unlikely(err)) { - *copied -= free_start_sg(sk, m, true); - break; - } - - apply_bytes_dec(psock, send); - psock->sg_size -= send; - break; - case __SK_REDIRECT: - redir = psock->sk_redir; - apply_bytes_dec(psock, send); - - if (psock->cork) { - cork = true; - psock->cork = NULL; - } - - return_mem_sg(sk, send, m); - release_sock(sk); - - err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); - lock_sock(sk); - - if (unlikely(err < 0)) { - int free = free_start_sg(sk, m, false); - - psock->sg_size = 0; - if (!cork) - *copied -= free; - } else { - psock->sg_size -= send; - } - - if (cork) { - free_start_sg(sk, m, true); - psock->sg_size = 0; - kfree(m); - m = NULL; - err = 0; - } - break; - case __SK_DROP: - default: - free_bytes_sg(sk, send, m, true); - apply_bytes_dec(psock, send); - *copied -= send; - psock->sg_size -= send; - err = -EACCES; - break; - } - - if (likely(!err)) { - bpf_md_init(psock); - if (m && - m->sg_data[m->sg_start].page_link && - m->sg_data[m->sg_start].length) - goto more_data; - } - -out_err: - return err; -} - -static int bpf_wait_data(struct sock *sk, - struct smap_psock *psk, int flags, - long timeo, int *err) -{ - int rc; - - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - rc = sk_wait_event(sk, &timeo, - !list_empty(&psk->ingress) || - !skb_queue_empty(&sk->sk_receive_queue), - &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - - return rc; -} - -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct iov_iter *iter = &msg->msg_iter; - struct smap_psock *psock; - int copied = 0; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) - goto out; - rcu_read_unlock(); - - lock_sock(sk); -bytes_ready: - while (copied != len) { - struct scatterlist *sg; - struct sk_msg_buff *md; - int i; - - md = list_first_entry_or_null(&psock->ingress, - struct sk_msg_buff, list); - if (unlikely(!md)) - break; - i = md->sg_start; - do { - struct page *page; - int n, copy; - - sg = &md->sg_data[i]; - copy = sg->length; - page = sg_page(sg); - - if (copied + copy > len) - copy = len - copied; - - n = copy_page_to_iter(page, sg->offset, copy, iter); - if (n != copy) { - md->sg_start = i; - release_sock(sk); - smap_release_sock(psock, sk); - return -EFAULT; - } - - copied += copy; - sg->offset += copy; - sg->length -= copy; - sk_mem_uncharge(sk, copy); - - if (!sg->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!md->skb) - put_page(page); - } - if (copied == len) - break; - } while (i != md->sg_end); - md->sg_start = i; - - if (!sg->length && md->sg_start == md->sg_end) { - list_del(&md->list); - consume_skb(md->skb); - kfree(md); - } - } - - if (!copied) { - long timeo; - int data; - int err = 0; - - timeo = sock_rcvtimeo(sk, nonblock); - data = bpf_wait_data(sk, psock, flags, timeo, &err); - - if (data) { - if (!skb_queue_empty(&sk->sk_receive_queue)) { - release_sock(sk); - smap_release_sock(psock, sk); - copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - return copied; - } - goto bytes_ready; - } - - if (err) - copied = err; - } - - release_sock(sk); - smap_release_sock(psock, sk); - return copied; -out: - rcu_read_unlock(); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); -} - - -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; - struct sk_msg_buff md = {0}; - unsigned int sg_copy = 0; - struct smap_psock *psock; - int copied = 0, err = 0; - struct scatterlist *sg; - long timeo; - - /* Its possible a sock event or user removed the psock _but_ the ops - * have not been reprogrammed yet so we get here. In this case fallback - * to tcp_sendmsg. Note this only works because we _only_ ever allow - * a single ULP there is no hierarchy here. - */ - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - /* Increment the psock refcnt to ensure its not released while sending a - * message. Required because sk lookup and bpf programs are used in - * separate rcu critical sections. Its OK if we lose the map entry - * but we can't lose the sock reference. - */ - if (!refcount_inc_not_zero(&psock->refcnt)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - sg = md.sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - rcu_read_unlock(); - - lock_sock(sk); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - - while (msg_data_left(msg)) { - struct sk_msg_buff *m = NULL; - bool enospc = false; - int copy; - - if (sk->sk_err) { - err = -sk->sk_err; - goto out_err; - } - - copy = msg_data_left(msg); - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - - m = psock->cork_bytes ? psock->cork : &md; - m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; - err = sk_alloc_sg(sk, copy, m->sg_data, - m->sg_start, &m->sg_end, &sg_copy, - m->sg_end - 1); - if (err) { - if (err != -ENOSPC) - goto wait_for_memory; - enospc = true; - copy = sg_copy; - } - - err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); - if (err < 0) { - free_curr_sg(sk, m); - goto out_err; - } - - psock->sg_size += copy; - copied += copy; - sg_copy = 0; - - /* When bytes are being corked skip running BPF program and - * applying verdict unless there is no more buffer space. In - * the ENOSPC case simply run BPF prorgram with currently - * accumulated data. We don't have much choice at this point - * we could try extending the page frags or chaining complex - * frags but even in these cases _eventually_ we will hit an - * OOM scenario. More complex recovery schemes may be - * implemented in the future, but BPF programs must handle - * the case where apply_cork requests are not honored. The - * canonical method to verify this is to check data length. - */ - if (psock->cork_bytes) { - if (copy > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= copy; - - if (psock->cork_bytes && !enospc) - goto out_cork; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); - if (unlikely(err < 0)) - goto out_err; - continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - err = sk_stream_wait_memory(sk, &timeo); - if (err) { - if (m && m != psock->cork) - free_start_sg(sk, m, true); - goto out_err; - } - } -out_err: - if (err < 0) - err = sk_stream_error(sk, msg->msg_flags, err); -out_cork: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -} - -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct sk_msg_buff md = {0}, *m = NULL; - int err = 0, copied = 0; - struct smap_psock *psock; - struct scatterlist *sg; - bool enospc = false; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto accept; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto accept; - rcu_read_unlock(); - - lock_sock(sk); - - if (psock->cork_bytes) { - m = psock->cork; - sg = &m->sg_data[m->sg_end]; - } else { - m = &md; - sg = m->sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - } - - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(m->sg_end == m->sg_start && - m->sg_data[m->sg_end].length)) - goto out_err; - - psock->sg_size += size; - sg_set_page(sg, page, size, offset); - get_page(page); - m->sg_copy[m->sg_end] = true; - sk_mem_charge(sk, size); - m->sg_end++; - copied = size; - - if (m->sg_end == MAX_SKB_FRAGS) - m->sg_end = 0; - - if (m->sg_end == m->sg_start) - enospc = true; - - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - - if (psock->cork_bytes && !enospc) - goto out_err; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); -out_err: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -accept: - rcu_read_unlock(); - return tcp_sendpage(sk, page, offset, size, flags); -} - -static void bpf_tcp_msg_add(struct smap_psock *psock, - struct sock *sk, - struct bpf_prog *tx_msg) -{ - struct bpf_prog *orig_tx_msg; - - orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); - if (orig_tx_msg) - bpf_prog_put(orig_tx_msg); -} - -static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) -{ - struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); - int rc; - - if (unlikely(!prog)) - return __SK_DROP; - - skb_orphan(skb); - /* We need to ensure that BPF metadata for maps is also cleared - * when we orphan the skb so that we don't have the possibility - * to reference a stale map. - */ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - rc = (*prog->bpf_func)(skb, prog->insnsi); - preempt_enable(); - skb->sk = NULL; - - /* Moving return codes from UAPI namespace into internal namespace */ - return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP; -} - -static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) -{ - struct sock *sk = psock->sock; - int copied = 0, num_sg; - struct sk_msg_buff *r; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); - if (unlikely(!r)) - return -EAGAIN; - - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(r); - return -EAGAIN; - } - - sg_init_table(r->sg_data, MAX_SKB_FRAGS); - num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); - if (unlikely(num_sg < 0)) { - kfree(r); - return num_sg; - } - sk_mem_charge(sk, skb->len); - copied = skb->len; - r->sg_start = 0; - r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg; - r->skb = skb; - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - return copied; -} - -static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) -{ - struct smap_psock *peer; - struct sock *sk; - __u32 in; - int rc; - - rc = smap_verdict_func(psock, skb); - switch (rc) { - case __SK_REDIRECT: - sk = do_sk_redirect_map(skb); - if (!sk) { - kfree_skb(skb); - break; - } - - peer = smap_psock_sk(sk); - in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - - if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || - !test_bit(SMAP_TX_RUNNING, &peer->state))) { - kfree_skb(skb); - break; - } - - if (!in && sock_writeable(sk)) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } else if (in && - atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } - /* Fall through and free skb otherwise */ - case __SK_DROP: - default: - kfree_skb(skb); - } -} - -static void smap_report_sk_error(struct smap_psock *psock, int err) -{ - struct sock *sk = psock->sock; - - sk->sk_err = err; - sk->sk_error_report(sk); -} - -static void smap_read_sock_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - smap_do_verdict(psock, skb); - rcu_read_unlock(); -} - -/* Called with lock held on socket */ -static void smap_data_ready(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); - } - rcu_read_unlock(); -} - -static void smap_tx_work(struct work_struct *w) -{ - struct smap_psock *psock; - struct sk_buff *skb; - int rem, off, n; - - psock = container_of(w, struct smap_psock, tx_work); - - /* lock sock to avoid losing sk_socket at some point during loop */ - lock_sock(psock->sock); - if (psock->save_skb) { - skb = psock->save_skb; - rem = psock->save_rem; - off = psock->save_off; - psock->save_skb = NULL; - goto start; - } - - while ((skb = skb_dequeue(&psock->rxqueue))) { - __u32 flags; - - rem = skb->len; - off = 0; -start: - flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - do { - if (likely(psock->sock->sk_socket)) { - if (flags) - n = smap_do_ingress(psock, skb); - else - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - } else { - n = -EINVAL; - } - - if (n <= 0) { - if (n == -EAGAIN) { - /* Retry when space is available */ - psock->save_skb = skb; - psock->save_rem = rem; - psock->save_off = off; - goto out; - } - /* Hard errors break pipe and stop xmit */ - smap_report_sk_error(psock, n ? -n : EPIPE); - clear_bit(SMAP_TX_RUNNING, &psock->state); - kfree_skb(skb); - goto out; - } - rem -= n; - off += n; - } while (rem); - - if (!flags) - kfree_skb(skb); - } -out: - release_sock(psock->sock); -} - -static void smap_write_space(struct sock *sk) -{ - struct smap_psock *psock; - void (*write_space)(struct sock *sk); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) - schedule_work(&psock->tx_work); - write_space = psock->save_write_space; - rcu_read_unlock(); - write_space(sk); -} - -static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) -{ - if (!psock->strp_enabled) - return; - sk->sk_data_ready = psock->save_data_ready; - sk->sk_write_space = psock->save_write_space; - psock->save_data_ready = NULL; - psock->save_write_space = NULL; - strp_stop(&psock->strp); - psock->strp_enabled = false; -} - -static void smap_destroy_psock(struct rcu_head *rcu) -{ - struct smap_psock *psock = container_of(rcu, - struct smap_psock, rcu); - - /* Now that a grace period has passed there is no longer - * any reference to this sock in the sockmap so we can - * destroy the psock, strparser, and bpf programs. But, - * because we use workqueue sync operations we can not - * do it in rcu context - */ - schedule_work(&psock->gc_work); -} - -static bool psock_is_smap_sk(struct sock *sk) -{ - return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock) -{ - if (refcount_dec_and_test(&psock->refcnt)) { - if (psock_is_smap_sk(sock)) - bpf_tcp_release(sock); - write_lock_bh(&sock->sk_callback_lock); - smap_stop_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); - } -} - -static int smap_parse_func_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - struct bpf_prog *prog; - int rc; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - prog = READ_ONCE(psock->bpf_parse); - - if (unlikely(!prog)) { - rcu_read_unlock(); - return skb->len; - } - - /* Attach socket for bpf program to use if needed we can do this - * because strparser clones the skb before handing it to a upper - * layer, meaning skb_orphan has been called. We NULL sk on the - * way out to ensure we don't trigger a BUG_ON in skb/sk operations - * later and because we are not charging the memory of this skb to - * any socket yet. - */ - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - rc = (*prog->bpf_func)(skb, prog->insnsi); - skb->sk = NULL; - rcu_read_unlock(); - return rc; -} - -static int smap_read_sock_done(struct strparser *strp, int err) -{ - return err; -} - -static int smap_init_sock(struct smap_psock *psock, - struct sock *sk) -{ - static const struct strp_callbacks cb = { - .rcv_msg = smap_read_sock_strparser, - .parse_msg = smap_parse_func_strparser, - .read_sock_done = smap_read_sock_done, - }; - - return strp_init(&psock->strp, sk, &cb); -} - -static void smap_init_progs(struct smap_psock *psock, - struct bpf_prog *verdict, - struct bpf_prog *parse) -{ - struct bpf_prog *orig_parse, *orig_verdict; - - orig_parse = xchg(&psock->bpf_parse, parse); - orig_verdict = xchg(&psock->bpf_verdict, verdict); - - if (orig_verdict) - bpf_prog_put(orig_verdict); - if (orig_parse) - bpf_prog_put(orig_parse); -} - -static void smap_start_sock(struct smap_psock *psock, struct sock *sk) -{ - if (sk->sk_data_ready == smap_data_ready) - return; - psock->save_data_ready = sk->sk_data_ready; - psock->save_write_space = sk->sk_write_space; - sk->sk_data_ready = smap_data_ready; - sk->sk_write_space = smap_write_space; - psock->strp_enabled = true; -} - -static void sock_map_remove_complete(struct bpf_stab *stab) -{ - bpf_map_area_free(stab->sock_map); - kfree(stab); -} - -static void smap_gc_work(struct work_struct *w) -{ - struct smap_psock_map_entry *e, *tmp; - struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; - - psock = container_of(w, struct smap_psock, gc_work); - - /* no callback lock needed because we already detached sockmap ops */ - if (psock->strp_enabled) - strp_done(&psock->strp); - - cancel_work_sync(&psock->tx_work); - __skb_queue_purge(&psock->rxqueue); - - /* At this point all strparser and xmit work must be complete */ - if (psock->bpf_parse) - bpf_prog_put(psock->bpf_parse); - if (psock->bpf_verdict) - bpf_prog_put(psock->bpf_verdict); - if (psock->bpf_tx_msg) - bpf_prog_put(psock->bpf_tx_msg); - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - list_del(&e->list); - kfree(e); - } - - if (psock->sk_redir) - sock_put(psock->sk_redir); - - sock_put(psock->sock); - kfree(psock); -} - -static struct smap_psock *smap_init_psock(struct sock *sock, int node) -{ - struct smap_psock *psock; - - psock = kzalloc_node(sizeof(struct smap_psock), - GFP_ATOMIC | __GFP_NOWARN, - node); - if (!psock) - return ERR_PTR(-ENOMEM); - - psock->eval = __SK_NONE; - psock->sock = sock; - skb_queue_head_init(&psock->rxqueue); - INIT_WORK(&psock->tx_work, smap_tx_work); - INIT_WORK(&psock->gc_work, smap_gc_work); - INIT_LIST_HEAD(&psock->maps); - INIT_LIST_HEAD(&psock->ingress); - refcount_set(&psock->refcnt, 1); - spin_lock_init(&psock->maps_lock); - - rcu_assign_sk_user_data(sock, psock); - sock_hold(sock); - return psock; -} - -static struct bpf_map *sock_map_alloc(union bpf_attr *attr) -{ - struct bpf_stab *stab; - u64 cost; - int err; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - stab = kzalloc(sizeof(*stab), GFP_USER); - if (!stab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&stab->map, attr); - raw_spin_lock_init(&stab->lock); - - /* make sure page count doesn't overflow */ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = -EINVAL; - if (cost >= U32_MAX - PAGE_SIZE) - goto free_stab; - - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(stab->map.pages); - if (err) - goto free_stab; - - err = -ENOMEM; - stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * - sizeof(struct sock *), - stab->map.numa_node); - if (!stab->sock_map) - goto free_stab; - - return &stab->map; -free_stab: - kfree(stab); - return ERR_PTR(err); -} - -static void smap_list_map_remove(struct smap_psock *psock, - struct sock **entry) -{ - struct smap_psock_map_entry *e, *tmp; - - spin_lock_bh(&psock->maps_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { - list_del(&e->list); - kfree(e); - } - } - spin_unlock_bh(&psock->maps_lock); -} - -static void smap_list_hash_remove(struct smap_psock *psock, - struct htab_elem *hash_link) -{ - struct smap_psock_map_entry *e, *tmp; - - spin_lock_bh(&psock->maps_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - struct htab_elem *c = rcu_dereference(e->hash_link); - - if (c == hash_link) { - list_del(&e->list); - kfree(e); - } - } - spin_unlock_bh(&psock->maps_lock); -} - -static void sock_map_free(struct bpf_map *map) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - int i; - - synchronize_rcu(); - - /* At this point no update, lookup or delete operations can happen. - * However, be aware we can still get a socket state event updates, - * and data ready callabacks that reference the psock from sk_user_data - * Also psock worker threads are still in-flight. So smap_release_sock - * will only free the psock after cancel_sync on the worker threads - * and a grace period expire to ensure psock is really safe to remove. - */ - rcu_read_lock(); - raw_spin_lock_bh(&stab->lock); - for (i = 0; i < stab->map.max_entries; i++) { - struct smap_psock *psock; - struct sock *sock; - - sock = stab->sock_map[i]; - if (!sock) - continue; - stab->sock_map[i] = NULL; - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); - } - } - raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); - - sock_map_remove_complete(stab); -} - -static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - u32 i = key ? *(u32 *)key : U32_MAX; - u32 *next = (u32 *)next_key; - - if (i >= stab->map.max_entries) { - *next = 0; - return 0; - } - - if (i == stab->map.max_entries - 1) - return -ENOENT; - - *next = i + 1; - return 0; -} - -struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - if (key >= map->max_entries) - return NULL; - - return READ_ONCE(stab->sock_map[key]); -} - -static int sock_map_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct smap_psock *psock; - int k = *(u32 *)key; - struct sock *sock; - - if (k >= map->max_entries) - return -EINVAL; - - raw_spin_lock_bh(&stab->lock); - sock = stab->sock_map[k]; - stab->sock_map[k] = NULL; - raw_spin_unlock_bh(&stab->lock); - if (!sock) - return -EINVAL; - - psock = smap_psock_sk(sock); - if (!psock) - return 0; - if (psock->bpf_parse) { - write_lock_bh(&sock->sk_callback_lock); - smap_stop_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - smap_list_map_remove(psock, &stab->sock_map[k]); - smap_release_sock(psock, sock); - return 0; -} - -/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are - * done inside rcu critical sections. This ensures on updates that the psock - * will not be released via smap_release_sock() until concurrent updates/deletes - * complete. All operations operate on sock_map using cmpxchg and xchg - * operations to ensure we do not get stale references. Any reads into the - * map must be done with READ_ONCE() because of this. - * - * A psock is destroyed via call_rcu and after any worker threads are cancelled - * and syncd so we are certain all references from the update/lookup/delete - * operations as well as references in the data path are no longer in use. - * - * Psocks may exist in multiple maps, but only a single set of parse/verdict - * programs may be inherited from the maps it belongs to. A reference count - * is kept with the total number of references to the psock from all maps. The - * psock will not be released until this reaches zero. The psock and sock - * user data data use the sk_callback_lock to protect critical data structures - * from concurrent access. This allows us to avoid two updates from modifying - * the user data in sock and the lock is required anyways for modifying - * callbacks, we simply increase its scope slightly. - * - * Rules to follow, - * - psock must always be read inside RCU critical section - * - sk_user_data must only be modified inside sk_callback_lock and read - * inside RCU critical section. - * - psock->maps list must only be read & modified inside sk_callback_lock - * - sock_map must use READ_ONCE and (cmp)xchg operations - * - BPF verdict/parse programs must use READ_ONCE and xchg operations - */ - -static int __sock_map_ctx_update_elem(struct bpf_map *map, - struct bpf_sock_progs *progs, - struct sock *sock, - void *key) -{ - struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock *psock; - bool new = false; - int err = 0; - - /* 1. If sock map has BPF programs those will be inherited by the - * sock being added. If the sock is already attached to BPF programs - * this results in an error. - */ - verdict = READ_ONCE(progs->bpf_verdict); - parse = READ_ONCE(progs->bpf_parse); - tx_msg = READ_ONCE(progs->bpf_tx_msg); - - if (parse && verdict) { - /* bpf prog refcnt may be zero if a concurrent attach operation - * removes the program after the above READ_ONCE() but before - * we increment the refcnt. If this is the case abort with an - * error. - */ - verdict = bpf_prog_inc_not_zero(verdict); - if (IS_ERR(verdict)) - return PTR_ERR(verdict); - - parse = bpf_prog_inc_not_zero(parse); - if (IS_ERR(parse)) { - bpf_prog_put(verdict); - return PTR_ERR(parse); - } - } - - if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(tx_msg); - if (IS_ERR(tx_msg)) { - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - return PTR_ERR(tx_msg); - } - } - - psock = smap_psock_sk(sock); - - /* 2. Do not allow inheriting programs if psock exists and has - * already inherited programs. This would create confusion on - * which parser/verdict program is running. If no psock exists - * create one. Inside sk_callback_lock to ensure concurrent create - * doesn't update user data. - */ - if (psock) { - if (!psock_is_smap_sk(sock)) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_parse) && parse) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { - err = -EBUSY; - goto out_progs; - } - if (!refcount_inc_not_zero(&psock->refcnt)) { - err = -EAGAIN; - goto out_progs; - } - } else { - psock = smap_init_psock(sock, map->numa_node); - if (IS_ERR(psock)) { - err = PTR_ERR(psock); - goto out_progs; - } - - set_bit(SMAP_TX_RUNNING, &psock->state); - new = true; - } - - /* 3. At this point we have a reference to a valid psock that is - * running. Attach any BPF programs needed. - */ - if (tx_msg) - bpf_tcp_msg_add(psock, sock, tx_msg); - if (new) { - err = bpf_tcp_init(sock); - if (err) - goto out_free; - } - - if (parse && verdict && !psock->strp_enabled) { - err = smap_init_sock(psock, sock); - if (err) - goto out_free; - smap_init_progs(psock, verdict, parse); - write_lock_bh(&sock->sk_callback_lock); - smap_start_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - - return err; -out_free: - smap_release_sock(psock, sock); -out_progs: - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - if (tx_msg) - bpf_prog_put(tx_msg); - return err; -} - -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock = skops->sk; - struct smap_psock_map_entry *e; - struct smap_psock *psock; - u32 i = *(u32 *)key; - int err; - - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto out; - - /* psock guaranteed to be present. */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&stab->lock); - osock = stab->sock_map[i]; - if (osock && flags == BPF_NOEXIST) { - err = -EEXIST; - goto out_unlock; - } - if (!osock && flags == BPF_EXIST) { - err = -ENOENT; - goto out_unlock; - } - - e->entry = &stab->sock_map[i]; - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - stab->sock_map[i] = sock; - if (osock) { - psock = smap_psock_sk(osock); - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, osock); - } - raw_spin_unlock_bh(&stab->lock); - return 0; -out_unlock: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&stab->lock); -out: - kfree(e); - return err; -} - -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } else { - return -EINVAL; - } - - switch (type) { - case BPF_SK_MSG_VERDICT: - orig = xchg(&progs->bpf_tx_msg, prog); - break; - case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&progs->bpf_parse, prog); - break; - case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&progs->bpf_verdict, prog); - break; - default: - return -EOPNOTSUPP; - } - - if (orig) - bpf_prog_put(orig); - - return 0; -} - -int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog) -{ - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - err = sock_map_prog(map, prog, attr->attach_type); - fdput(f); - return err; -} - -static void *sock_map_lookup(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int sock_map_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. - */ - if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP || - skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_map_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static void sock_map_release(struct bpf_map *map) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } - - orig = xchg(&progs->bpf_parse, NULL); - if (orig) - bpf_prog_put(orig); - orig = xchg(&progs->bpf_verdict, NULL); - if (orig) - bpf_prog_put(orig); - - orig = xchg(&progs->bpf_tx_msg, NULL); - if (orig) - bpf_prog_put(orig); -} - -static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) -{ - struct bpf_htab *htab; - int i, err; - u64 cost; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || - attr->key_size == 0 || - attr->value_size != 4 || - attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return ERR_PTR(-E2BIG); - - htab = kzalloc(sizeof(*htab), GFP_USER); - if (!htab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&htab->map, attr); - - htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8); - err = -EINVAL; - if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct bucket)) - goto free_htab; - - cost = (u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries; - - if (cost >= U32_MAX - PAGE_SIZE) - goto free_htab; - - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(htab->map.pages); - if (err) - goto free_htab; - - err = -ENOMEM; - htab->buckets = bpf_map_area_alloc( - htab->n_buckets * sizeof(struct bucket), - htab->map.numa_node); - if (!htab->buckets) - goto free_htab; - - for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); - raw_spin_lock_init(&htab->buckets[i].lock); - } - - return &htab->map; -free_htab: - kfree(htab); - return ERR_PTR(err); -} - -static void __bpf_htab_free(struct rcu_head *rcu) -{ - struct bpf_htab *htab; - - htab = container_of(rcu, struct bpf_htab, rcu); - bpf_map_area_free(htab->buckets); - kfree(htab); -} - -static void sock_hash_free(struct bpf_map *map) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - int i; - - synchronize_rcu(); - - /* At this point no update, lookup or delete operations can happen. - * However, be aware we can still get a socket state event updates, - * and data ready callabacks that reference the psock from sk_user_data - * Also psock worker threads are still in-flight. So smap_release_sock - * will only free the psock after cancel_sync on the worker threads - * and a grace period expire to ensure psock is really safe to remove. - */ - rcu_read_lock(); - for (i = 0; i < htab->n_buckets; i++) { - struct bucket *b = __select_bucket(htab, i); - struct hlist_head *head; - struct hlist_node *n; - struct htab_elem *l; - - raw_spin_lock_bh(&b->lock); - head = &b->head; - hlist_for_each_entry_safe(l, n, head, hash_node) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get - * the sk_callback_lock before this case but after xchg - * causing the refcnt to hit zero and sock user data - * (psock) to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - } - raw_spin_unlock_bh(&b->lock); - } - rcu_read_unlock(); - call_rcu(&htab->rcu, __bpf_htab_free); -} - -static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, - void *key, u32 key_size, u32 hash, - struct sock *sk, - struct htab_elem *old_elem) -{ - struct htab_elem *l_new; - - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - if (!old_elem) { - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); - } - } - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); - if (!l_new) { - atomic_dec(&htab->count); - return ERR_PTR(-ENOMEM); - } - - memcpy(l_new->key, key, key_size); - l_new->sk = sk; - l_new->hash = hash; - return l_new; -} - -static inline u32 htab_map_hash(const void *key, u32 key_len) -{ - return jhash(key, key_len, 0); -} - -static int sock_hash_get_next_key(struct bpf_map *map, - void *key, void *next_key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l, *next_l; - struct hlist_head *h; - u32 hash, key_size; - int i = 0; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - key_size = map->key_size; - if (!key) - goto find_first_elem; - hash = htab_map_hash(key, key_size); - h = select_bucket(htab, hash); - - l = lookup_elem_raw(h, hash, key, key_size); - if (!l) - goto find_first_elem; - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), - struct htab_elem, hash_node); - if (next_l) { - memcpy(next_key, next_l->key, key_size); - return 0; - } - - /* no more elements in this hash list, go to the next bucket */ - i = hash & (htab->n_buckets - 1); - i++; - -find_first_elem: - /* iterate over buckets */ - for (; i < htab->n_buckets; i++) { - h = select_bucket(htab, i); - - /* pick first element in the bucket */ - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(h)), - struct htab_elem, hash_node); - if (next_l) { - /* if it's not empty, just return it */ - memcpy(next_key, next_l->key, key_size); - return 0; - } - } - - /* iterated over all buckets and all elements */ - return -ENOENT; -} - -static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 map_flags) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_sock_progs *progs = &htab->progs; - struct htab_elem *l_new = NULL, *l_old; - struct smap_psock_map_entry *e = NULL; - struct hlist_head *head; - struct smap_psock *psock; - u32 key_size, hash; - struct sock *sock; - struct bucket *b; - int err; - - sock = skops->sk; - - if (sock->sk_type != SOCK_STREAM || - sock->sk_protocol != IPPROTO_TCP) - return -EOPNOTSUPP; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - WARN_ON_ONCE(!rcu_read_lock_held()); - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto err; - - /* psock is valid here because otherwise above *ctx_update_elem would - * have thrown an error. It is safe to skip error check. - */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&b->lock); - l_old = lookup_elem_raw(head, hash, key, key_size); - if (l_old && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto bucket_err; - } - if (!l_old && map_flags == BPF_EXIST) { - err = -ENOENT; - goto bucket_err; - } - - l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); - if (IS_ERR(l_new)) { - err = PTR_ERR(l_new); - goto bucket_err; - } - - rcu_assign_pointer(e->hash_link, l_new); - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - /* add new element to the head of the list, so that - * concurrent search will find it before old elem - */ - hlist_add_head_rcu(&l_new->hash_node, head); - if (l_old) { - psock = smap_psock_sk(l_old->sk); - - hlist_del_rcu(&l_old->hash_node); - smap_list_hash_remove(psock, l_old); - smap_release_sock(psock, l_old->sk); - free_htab_elem(htab, l_old); - } - raw_spin_unlock_bh(&b->lock); - return 0; -bucket_err: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&b->lock); -err: - kfree(e); - return err; -} - -static int sock_hash_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. - */ - if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP || - skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_hash_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static int sock_hash_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct bucket *b; - struct htab_elem *l; - u32 hash, key_size; - int ret = -ENOENT; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, hash, key, key_size); - if (l) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - ret = 0; - } - raw_spin_unlock_bh(&b->lock); - return ret; -} - -struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - u32 key_size, hash; - struct bucket *b; - struct sock *sk; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - l = lookup_elem_raw(head, hash, key, key_size); - sk = l ? l->sk : NULL; - return sk; -} - -const struct bpf_map_ops sock_map_ops = { - .map_alloc = sock_map_alloc, - .map_free = sock_map_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_map_get_next_key, - .map_update_elem = sock_map_update_elem, - .map_delete_elem = sock_map_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -const struct bpf_map_ops sock_hash_ops = { - .map_alloc = sock_hash_alloc, - .map_free = sock_hash_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_hash_get_next_key, - .map_update_elem = sock_hash_update_elem, - .map_delete_elem = sock_hash_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) -{ - return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || - ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; -} -BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. This checks that the sock ops triggering the update is - * one indicating we are (or will be soon) in an ESTABLISHED state. - */ - if (!bpf_is_valid_sock_op(bpf_sock)) - return -EOPNOTSUPP; - return sock_map_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_map_update_proto = { - .func = bpf_sock_map_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (!bpf_is_valid_sock_op(bpf_sock)) - return -EOPNOTSUPP; - return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_hash_update_proto = { - .func = bpf_sock_hash_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 53968f82b919..f4ecd6ed2252 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1664,7 +1664,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - ret = sockmap_get_from_fd(attr, ptype, prog); + ret = sock_map_get_from_fd(attr, prog); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); @@ -1718,10 +1718,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); + return sock_map_get_from_fd(attr, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); + return sock_map_get_from_fd(attr, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: |