diff options
Diffstat (limited to 'include')
44 files changed, 1424 insertions, 287 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 64f367044e25..2f98d2fce62e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -279,6 +279,31 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) +/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a + * fullsock and its parent fullsock cannot be traced by + * sk_to_full_sk(). + * + * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. + * Its listener-sk is not attached to the rsk_listener. + * In this case, the caller holds the listener-sk (unlocked), + * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with + * the listener-sk such that the cgroup-bpf-progs of the + * listener-sk will be run. + * + * Regardless of syncookie mode or not, + * calling bpf_setsockopt on listener-sk will not make sense anyway, + * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. + */ +#define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 55f694b63164..c6d9f2c444f4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -34,6 +34,8 @@ struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; +struct bpf_local_storage; +struct bpf_local_storage_map; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -104,6 +106,25 @@ struct bpf_map_ops { __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); + /* Functions called by bpf_local_storage maps */ + int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); + + /* map_meta_equal must be implemented for maps that can be + * used as an inner map. It is a runtime check to ensure + * an inner map can be inserted to an outer map. + * + * Some properties of the inner map has been used during the + * verification time. When inserting an inner map at the runtime, + * map_meta_equal has to ensure the inserting map has the same + * properties that the verifier has used earlier. + */ + bool (*map_meta_equal)(const struct bpf_map *meta0, + const struct bpf_map *meta1); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -227,6 +248,9 @@ int map_check_no_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ @@ -309,6 +333,7 @@ struct bpf_func_proto { * for this argument. */ int *ret_btf_id; /* return value btf_id */ + bool (*allowed)(const struct bpf_prog *prog); }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -514,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +void notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(void); struct bpf_ksym { unsigned long start; @@ -709,6 +736,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool sleepable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -1218,12 +1246,18 @@ typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; + bpf_iter_show_fdinfo_t show_fdinfo; + bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; @@ -1250,6 +1284,10 @@ int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); +void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); @@ -1340,6 +1378,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); +bool btf_struct_ids_match(struct bpf_verifier_log *log, + int off, u32 id, u32 need_type_id); int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int); @@ -1358,6 +1398,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); +struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ @@ -1637,6 +1678,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else @@ -1658,6 +1700,12 @@ static inline int sock_map_prog_detach(const union bpf_attr *attr, { return -EOPNOTSUPP; } + +static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) @@ -1736,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; +extern const struct bpf_func_proto bpf_copy_from_user_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -1850,4 +1899,7 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +struct btf_id_set; +bool btf_id_set_contains(struct btf_id_set *set, u32 id); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h new file mode 100644 index 000000000000..b2c9463f36a1 --- /dev/null +++ b/include/linux/bpf_local_storage.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. + */ + +#ifndef _BPF_LOCAL_STORAGE_H +#define _BPF_LOCAL_STORAGE_H + +#include <linux/bpf.h> +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <uapi/linux/btf.h> + +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_map_bucket { + struct hlist_head list; + raw_spinlock_t lock; +}; + +/* Thp map is not the primary owner of a bpf_local_storage_elem. + * Instead, the container object (eg. sk->sk_bpf_storage) is. + * + * The map (bpf_local_storage_map) is for two purposes + * 1. Define the size of the "local storage". It is + * the map's value_size. + * + * 2. Maintain a list to keep track of all elems such + * that they can be cleaned up during the map destruction. + * + * When a bpf local storage is being looked up for a + * particular object, the "bpf_map" pointer is actually used + * as the "key" to search in the list of elem in + * the respective bpf_local_storage owned by the object. + * + * e.g. sk->sk_bpf_storage is the mini-map with the "bpf_map" pointer + * as the searching key. + */ +struct bpf_local_storage_map { + struct bpf_map map; + /* Lookup elem does not require accessing the map. + * + * Updating/Deleting requires a bucket lock to + * link/unlink the elem from the map. Having + * multiple buckets to improve contention. + */ + struct bpf_local_storage_map_bucket *buckets; + u32 bucket_log; + u16 elem_size; + u16 cache_idx; +}; + +struct bpf_local_storage_data { + /* smap is used as the searching key when looking up + * from the object's bpf_local_storage. + * + * Put it in the same cacheline as the data to minimize + * the number of cachelines access during the cache hit case. + */ + struct bpf_local_storage_map __rcu *smap; + u8 data[] __aligned(8); +}; + +/* Linked to bpf_local_storage and bpf_local_storage_map */ +struct bpf_local_storage_elem { + struct hlist_node map_node; /* Linked to bpf_local_storage_map */ + struct hlist_node snode; /* Linked to bpf_local_storage */ + struct bpf_local_storage __rcu *local_storage; + struct rcu_head rcu; + /* 8 bytes hole */ + /* The data is stored in aother cacheline to minimize + * the number of cachelines access during a cache hit. + */ + struct bpf_local_storage_data sdata ____cacheline_aligned; +}; + +struct bpf_local_storage { + struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; + struct hlist_head list; /* List of bpf_local_storage_elem */ + void *owner; /* The object that owns the above "list" of + * bpf_local_storage_elem. + */ + struct rcu_head rcu; + raw_spinlock_t lock; /* Protect adding/removing from the "list" */ +}; + +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ +#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - \ + sizeof(struct bpf_local_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_local_storage_elem))) + +#define SELEM(_SDATA) \ + container_of((_SDATA), struct bpf_local_storage_elem, sdata) +#define SDATA(_SELEM) (&(_SELEM)->sdata) + +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_cache { + spinlock_t idx_lock; + u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE]; +}; + +#define DEFINE_BPF_STORAGE_CACHE(name) \ +static struct bpf_local_storage_cache name = { \ + .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ +} + +u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); +void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx); + +/* Helper functions for bpf_local_storage */ +int bpf_local_storage_map_alloc_check(union bpf_attr *attr); + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit); + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type); + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); + +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_omem); + +void bpf_selem_unlink(struct bpf_local_storage_elem *selem); + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, + bool charge_mem); + +int +bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem); + +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags); + +#endif /* _BPF_LOCAL_STORAGE_H */ diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index af74712af585..aaacb6aafc87 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -17,9 +17,28 @@ #include <linux/lsm_hook_defs.h> #undef LSM_HOOK +struct bpf_storage_blob { + struct bpf_local_storage __rcu *storage; +}; + +extern struct lsm_blob_sizes bpf_lsm_blob_sizes; + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + if (unlikely(!inode->i_security)) + return NULL; + + return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; +} + +extern const struct bpf_func_proto bpf_inode_storage_get_proto; +extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +void bpf_inode_storage_free(struct inode *inode); + #else /* !CONFIG_BPF_LSM */ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, @@ -28,6 +47,16 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; } +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + return NULL; +} + +static inline void bpf_inode_storage_free(struct inode *inode) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a52a5688418e..2e6f568377f1 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -107,6 +107,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif +#ifdef CONFIG_BPF_LSM +BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/linux/btf.h b/include/linux/btf.h index 8b81fbb4497c..a9af5e7a7ece 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -64,8 +64,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, - u32 *type_size, const struct btf_type **elem_type, - u32 *total_nelems); + u32 *type_size); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 4867d549e3c1..210b086188a3 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -3,6 +3,11 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +struct btf_id_set { + u32 cnt; + u32 ids[]; +}; + #ifdef CONFIG_DEBUG_INFO_BTF #include <linux/compiler.h> /* for __PASTE */ @@ -62,7 +67,7 @@ asm( \ ".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ "." #scope " " #name "; \n" \ #name ":; \n" \ -".popsection; \n"); \ +".popsection; \n"); #define BTF_ID_LIST(name) \ __BTF_ID_LIST(name, local) \ @@ -88,12 +93,56 @@ asm( \ ".zero 4 \n" \ ".popsection; \n"); +/* + * The BTF_SET_START/END macros pair defines sorted list of + * BTF IDs plus its members count, with following layout: + * + * BTF_SET_START(list) + * BTF_ID(type1, name1) + * BTF_ID(type2, name2) + * BTF_SET_END(list) + * + * __BTF_ID__set__list: + * .zero 4 + * list: + * __BTF_ID__type1__name1__3: + * .zero 4 + * __BTF_ID__type2__name2__4: + * .zero 4 + * + */ +#define __BTF_SET_START(name, scope) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +"." #scope " __BTF_ID__set__" #name "; \n" \ +"__BTF_ID__set__" #name ":; \n" \ +".zero 4 \n" \ +".popsection; \n"); + +#define BTF_SET_START(name) \ +__BTF_ID_LIST(name, local) \ +__BTF_SET_START(name, local) + +#define BTF_SET_START_GLOBAL(name) \ +__BTF_ID_LIST(name, globl) \ +__BTF_SET_START(name, globl) + +#define BTF_SET_END(name) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +".size __BTF_ID__set__" #name ", .-" #name " \n" \ +".popsection; \n"); \ +extern struct btf_id_set name; + #else #define BTF_ID_LIST(name) static u32 name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ diff --git a/include/linux/filter.h b/include/linux/filter.h index ebfb7cfb65f1..05b4052715b9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1236,13 +1236,17 @@ struct bpf_sock_addr_kern { struct bpf_sock_ops_kern { struct sock *sk; - u32 op; union { u32 args[4]; u32 reply; u32 replylong[4]; }; - u32 is_fullsock; + struct sk_buff *syn_skb; + struct sk_buff *skb; + void *skb_data_end; + u8 op; + u8 is_fullsock; + u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 5bda8cf457b6..2a7660843444 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -27,9 +27,18 @@ struct tun_xdp_hdr { #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_frame(void *ptr); -void *tun_xdp_to_ptr(void *ptr); -void *tun_ptr_to_xdp(void *ptr); +static inline bool tun_is_xdp_frame(void *ptr) +{ + return (unsigned long)ptr & TUN_XDP_FLAG; +} +static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) +{ + return (void *)((unsigned long)xdp | TUN_XDP_FLAG); +} +static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); +} void tun_ptr_free(void *ptr); #else #include <linux/err.h> @@ -48,11 +57,11 @@ static inline bool tun_is_xdp_frame(void *ptr) { return false; } -static inline void *tun_xdp_to_ptr(void *ptr) +static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } -static inline void *tun_ptr_to_xdp(void *ptr) +static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 0ef2d800fda7..84abb30a3fbb 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -75,6 +75,8 @@ static inline size_t inet_diag_msg_attrs_size(void) #ifdef CONFIG_SOCK_CGROUP_DATA + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ #endif + + nla_total_size(sizeof(struct inet_diag_sockopt)) + /* INET_DIAG_SOCKOPT */ ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a44789d027cc..dda61d150a13 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -177,17 +177,6 @@ static inline int inet6_sdif(const struct sk_buff *skb) return 0; } -/* can not be used in TCP layer after tcp_v6_fill_cb */ -static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if defined(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_tcp_l3mdev_accept && - skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) - return true; -#endif - return false; -} - struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; @@ -345,17 +334,6 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return (struct raw6_sock *)sk; } -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - int ancestor_size = sizeof(struct inet_sock); - - if (sk_from->sk_family == PF_INET6) - ancestor_size += sizeof(struct ipv6_pinfo); - - __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); -} - #define __ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_only_sock(sk) (__ipv6_only_sock(sk)) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 898cbf00332a..3a88b699b758 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -358,6 +358,12 @@ static inline int mdiobus_c45_read(struct mii_bus *bus, int prtad, int devad, return mdiobus_read(bus, prtad, mdiobus_c45_addr(devad, regnum)); } +static inline int mdiobus_c45_write(struct mii_bus *bus, int prtad, int devad, + u16 regnum, u16 val) +{ + return mdiobus_write(bus, prtad, mdiobus_c45_addr(devad, regnum), val); +} + int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h new file mode 100644 index 000000000000..b1d27f7cd23f --- /dev/null +++ b/include/linux/mdio/mdio-i2c.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * MDIO I2C bridge + * + * Copyright (C) 2015 Russell King + */ +#ifndef MDIO_I2C_H +#define MDIO_I2C_H + +struct device; +struct i2c_adapter; +struct mii_bus; + +struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); + +#endif diff --git a/include/linux/mdio/mdio-xgene.h b/include/linux/mdio/mdio-xgene.h new file mode 100644 index 000000000000..8af93ada8b64 --- /dev/null +++ b/include/linux/mdio/mdio-xgene.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Applied Micro X-Gene SoC MDIO Driver + * + * Copyright (c) 2016, Applied Micro Circuits Corporation + * Author: Iyappan Subramanian <isubramanian@apm.com> + */ + +#ifndef __MDIO_XGENE_H__ +#define __MDIO_XGENE_H__ + +#define BLOCK_XG_MDIO_CSR_OFFSET 0x5000 +#define BLOCK_DIAG_CSR_OFFSET 0xd000 +#define XGENET_CONFIG_REG_ADDR 0x20 + +#define MAC_ADDR_REG_OFFSET 0x00 +#define MAC_COMMAND_REG_OFFSET 0x04 +#define MAC_WRITE_REG_OFFSET 0x08 +#define MAC_READ_REG_OFFSET 0x0c +#define MAC_COMMAND_DONE_REG_OFFSET 0x10 + +#define CLKEN_OFFSET 0x08 +#define SRST_OFFSET 0x00 + +#define MENET_CFG_MEM_RAM_SHUTDOWN_ADDR 0x70 +#define MENET_BLOCK_MEM_RDY_ADDR 0x74 + +#define MAC_CONFIG_1_ADDR 0x00 +#define MII_MGMT_COMMAND_ADDR 0x24 +#define MII_MGMT_ADDRESS_ADDR 0x28 +#define MII_MGMT_CONTROL_ADDR 0x2c +#define MII_MGMT_STATUS_ADDR 0x30 +#define MII_MGMT_INDICATORS_ADDR 0x34 +#define SOFT_RESET BIT(31) + +#define MII_MGMT_CONFIG_ADDR 0x20 +#define MII_MGMT_COMMAND_ADDR 0x24 +#define MII_MGMT_ADDRESS_ADDR 0x28 +#define MII_MGMT_CONTROL_ADDR 0x2c +#define MII_MGMT_STATUS_ADDR 0x30 +#define MII_MGMT_INDICATORS_ADDR 0x34 + +#define MIIM_COMMAND_ADDR 0x20 +#define MIIM_FIELD_ADDR 0x24 +#define MIIM_CONFIGURATION_ADDR 0x28 +#define MIIM_LINKFAILVECTOR_ADDR 0x2c +#define MIIM_INDICATOR_ADDR 0x30 +#define MIIMRD_FIELD_ADDR 0x34 + +#define MDIO_CSR_OFFSET 0x5000 + +#define REG_ADDR_POS 0 +#define REG_ADDR_LEN 5 +#define PHY_ADDR_POS 8 +#define PHY_ADDR_LEN 5 + +#define HSTMIIMWRDAT_POS 0 +#define HSTMIIMWRDAT_LEN 16 +#define HSTPHYADX_POS 23 +#define HSTPHYADX_LEN 5 +#define HSTREGADX_POS 18 +#define HSTREGADX_LEN 5 +#define HSTLDCMD BIT(3) +#define HSTMIIMCMD_POS 0 +#define HSTMIIMCMD_LEN 3 + +#define BUSY_MASK BIT(0) +#define READ_CYCLE_MASK BIT(0) + +enum xgene_enet_cmd { + XGENE_ENET_WR_CMD = BIT(31), + XGENE_ENET_RD_CMD = BIT(30) +}; + +enum { + MIIM_CMD_IDLE, + MIIM_CMD_LEGACY_WRITE, + MIIM_CMD_LEGACY_READ, +}; + +enum xgene_mdio_id { + XGENE_MDIO_RGMII = 1, + XGENE_MDIO_XFI +}; + +struct xgene_mdio_pdata { + struct clk *clk; + struct device *dev; + void __iomem *mac_csr_addr; + void __iomem *diag_csr_addr; + void __iomem *mdio_csr_addr; + struct mii_bus *mdio_bus; + int mdio_id; + spinlock_t mac_lock; /* mac lock */ +}; + +/* Set the specified value into a bit-field defined by its starting position + * and length within a single u64. + */ +static inline u64 xgene_enet_set_field_value(int pos, int len, u64 val) +{ + return (val & ((1ULL << len) - 1)) << pos; +} + +#define SET_VAL(field, val) \ + xgene_enet_set_field_value(field ## _POS, field ## _LEN, val) + +#define SET_BIT(field) \ + xgene_enet_set_field_value(field ## _POS, 1, 1) + +/* Get the value from a bit-field defined by its starting position + * and length within the specified u64. + */ +static inline u64 xgene_enet_get_field_value(int pos, int len, u64 src) +{ + return (src >> pos) & ((1ULL << len) - 1); +} + +#define GET_VAL(field, src) \ + xgene_enet_get_field_value(field ## _POS, field ## _LEN, src) + +#define GET_BIT(field, src) \ + xgene_enet_get_field_value(field ## _POS, 1, src) + +u32 xgene_mdio_rd_mac(struct xgene_mdio_pdata *pdata, u32 rd_addr); +void xgene_mdio_wr_mac(struct xgene_mdio_pdata *pdata, u32 wr_addr, u32 data); +int xgene_mdio_rgmii_read(struct mii_bus *bus, int phy_id, int reg); +int xgene_mdio_rgmii_write(struct mii_bus *bus, int phy_id, int reg, u16 data); +struct phy_device *xgene_enet_phy_register(struct mii_bus *bus, int phy_addr); + +#endif /* __MDIO_XGENE_H__ */ diff --git a/include/linux/net.h b/include/linux/net.h index d48ff1180879..7657c6432a69 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,8 @@ struct net; #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define PROTO_CMSG_DATA_ONLY 0x0001 + #ifndef ARCH_HAS_SOCKET_TYPES /** * enum sock_type - Socket types @@ -135,6 +137,7 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, struct proto_ops { int family; + unsigned int flags; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0e303f6603f..7f9fcfd15942 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -618,7 +618,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS - struct xdp_umem *umem; + struct xsk_buff_pool *pool; #endif /* * write-mostly part @@ -640,10 +640,14 @@ struct netdev_queue { extern int sysctl_fb_tunnels_only_for_init_net; extern int sysctl_devconf_inherit_init_net; +/* + * sysctl_fb_tunnels_only_for_init_net == 0 : For all netns + * == 1 : For initns only + * == 2 : For none. + */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return net == &init_net || - !IS_ENABLED(CONFIG_SYSCTL) || + return (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1) || !sysctl_fb_tunnels_only_for_init_net; } @@ -751,7 +755,7 @@ struct netdev_rx_queue { struct net_device *dev; struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_XDP_SOCKETS - struct xdp_umem *umem; + struct xsk_buff_pool *pool; #endif } ____cacheline_aligned_in_smp; @@ -879,7 +883,7 @@ enum bpf_netdev_command { /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, - XDP_SETUP_XSK_UMEM, + XDP_SETUP_XSK_POOL, }; struct bpf_prog_offload_ops; @@ -913,9 +917,9 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_SETUP_XSK_POOL */ struct { - struct xdp_umem *umem; + struct xsk_buff_pool *pool; u16 queue_id; } xsk; }; @@ -2193,6 +2197,22 @@ int netdev_get_num_tc(struct net_device *dev) return dev->num_tc; } +static inline void net_prefetch(void *p) +{ + prefetch(p); +#if L1_CACHE_BYTES < 128 + prefetch((u8 *)p + L1_CACHE_BYTES); +#endif +} + +static inline void net_prefetchw(void *p) +{ + prefetchw(p); +#if L1_CACHE_BYTES < 128 + prefetchw((u8 *)p + L1_CACHE_BYTES); +#endif +} + void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev); int netdev_bind_sb_channel_queue(struct net_device *dev, diff --git a/include/linux/pcs-lynx.h b/include/linux/pcs-lynx.h new file mode 100644 index 000000000000..a6440d6ebe95 --- /dev/null +++ b/include/linux/pcs-lynx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* Copyright 2020 NXP + * Lynx PCS helpers + */ + +#ifndef __LINUX_PCS_LYNX_H +#define __LINUX_PCS_LYNX_H + +#include <linux/mdio.h> +#include <linux/phylink.h> + +struct lynx_pcs { + struct phylink_pcs pcs; + struct mdio_device *mdio; +}; + +struct lynx_pcs *lynx_pcs_create(struct mdio_device *mdio); + +void lynx_pcs_destroy(struct lynx_pcs *pcs); + +#endif /* __LINUX_PCS_LYNX_H */ diff --git a/include/linux/mdio-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 9a841aa5982d..351c1c9aedc5 100644 --- a/include/linux/mdio-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -4,8 +4,8 @@ * Synopsys DesignWare XPCS helpers */ -#ifndef __LINUX_MDIO_XPCS_H -#define __LINUX_MDIO_XPCS_H +#ifndef __LINUX_PCS_XPCS_H +#define __LINUX_PCS_XPCS_H #include <linux/phy.h> #include <linux/phylink.h> @@ -29,7 +29,7 @@ struct mdio_xpcs_ops { int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface); }; -#if IS_ENABLED(CONFIG_MDIO_XPCS) +#if IS_ENABLED(CONFIG_PCS_XPCS) struct mdio_xpcs_ops *mdio_xpcs_get_ops(void); #else static inline struct mdio_xpcs_ops *mdio_xpcs_get_ops(void) @@ -38,4 +38,4 @@ static inline struct mdio_xpcs_ops *mdio_xpcs_get_ops(void) } #endif -#endif /* __LINUX_MDIO_XPCS_H */ +#endif /* __LINUX_PCS_XPCS_H */ diff --git a/include/linux/phylink.h b/include/linux/phylink.h index c36fb41a7d90..d81a714cfbbd 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -490,4 +490,7 @@ void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs); void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, struct phylink_link_state *state); + +void phylink_decode_usxgmii_word(struct phylink_link_state *state, + uint16_t lpa); #endif diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index dd00fa41f7e7..8437307cca8c 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -36,7 +36,6 @@ #define OFF_PTP_SOURCE_UUID 22 /* PTPv1 only */ #define OFF_PTP_SEQUENCE_ID 30 -#define OFF_PTP_CONTROL 32 /* PTPv1 only */ /* Below defines should actually be removed at some point in time. */ #define IP6_HLEN 40 @@ -44,6 +43,30 @@ #define OFF_IHL 14 #define IPV4_HLEN(data) (((struct iphdr *)(data + OFF_IHL))->ihl << 2) +struct clock_identity { + u8 id[8]; +} __packed; + +struct port_identity { + struct clock_identity clock_identity; + __be16 port_number; +} __packed; + +struct ptp_header { + u8 tsmt; /* transportSpecific | messageType */ + u8 ver; /* reserved | versionPTP */ + __be16 message_length; + u8 domain_number; + u8 reserved1; + u8 flag_field[2]; + __be64 correction; + __be32 reserved2; + struct port_identity source_port_identity; + __be16 sequence_id; + u8 control; + u8 log_message_interval; +} __packed; + #if defined(CONFIG_NET_PTP_CLASSIFY) /** * ptp_classify_raw - classify a PTP packet @@ -57,6 +80,46 @@ */ unsigned int ptp_classify_raw(const struct sk_buff *skb); +/** + * ptp_parse_header - Get pointer to the PTP v2 header + * @skb: packet buffer + * @type: type of the packet (see ptp_classify_raw()) + * + * This function takes care of the VLAN, UDP, IPv4 and IPv6 headers. The length + * is checked. + * + * Note, internally skb_mac_header() is used. Make sure that the @skb is + * initialized accordingly. + * + * Return: Pointer to the ptp v2 header or NULL if not found + */ +struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type); + +/** + * ptp_get_msgtype - Extract ptp message type from given header + * @hdr: ptp header + * @type: type of the packet (see ptp_classify_raw()) + * + * This function returns the message type for a given ptp header. It takes care + * of the different ptp header versions (v1 or v2). + * + * Return: The message type + */ +static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, + unsigned int type) +{ + u8 msgtype; + + if (unlikely(type & PTP_CLASS_V1)) { + /* msg type is located at the control field for ptp v1 */ + msgtype = hdr->control; + } else { + msgtype = hdr->tsmt & 0x0f; + } + + return msgtype; +} + void __init ptp_classifier_init(void); #else static inline void ptp_classifier_init(void) @@ -66,5 +129,10 @@ static inline unsigned int ptp_classify_raw(struct sk_buff *skb) { return PTP_CLASS_NONE; } +static inline struct ptp_header *ptp_parse_header(struct sk_buff *skb, + unsigned int type) +{ + return NULL; +} #endif #endif /* _PTP_CLASSIFY_H_ */ diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index cd6a5c7e56eb..56fa55841d39 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -21,6 +21,7 @@ #include <linux/qed/common_hsi.h> #include <linux/qed/qed_chain.h> #include <linux/io-64-nonatomic-lo-hi.h> +#include <net/devlink.h> enum dcbx_protocol_type { DCBX_PROTOCOL_ISCSI, @@ -779,6 +780,11 @@ enum qed_nvm_flash_cmd { QED_NVM_FLASH_CMD_NVM_MAX, }; +struct qed_devlink { + struct qed_dev *cdev; + struct devlink_health_reporter *fw_reporter; +}; + struct qed_common_cb_ops { void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); void (*link_update)(void *dev, struct qed_link_output *link); @@ -844,10 +850,9 @@ struct qed_common_ops { struct qed_dev* (*probe)(struct pci_dev *dev, struct qed_probe_params *params); - void (*remove)(struct qed_dev *cdev); + void (*remove)(struct qed_dev *cdev); - int (*set_power_state)(struct qed_dev *cdev, - pci_power_t state); + int (*set_power_state)(struct qed_dev *cdev, pci_power_t state); void (*set_name) (struct qed_dev *cdev, char name[]); @@ -855,50 +860,51 @@ struct qed_common_ops { * PF params required for the call before slowpath_start is * documented within the qed_pf_params structure definition. */ - void (*update_pf_params)(struct qed_dev *cdev, - struct qed_pf_params *params); - int (*slowpath_start)(struct qed_dev *cdev, - struct qed_slowpath_params *params); + void (*update_pf_params)(struct qed_dev *cdev, + struct qed_pf_params *params); - int (*slowpath_stop)(struct qed_dev *cdev); + int (*slowpath_start)(struct qed_dev *cdev, + struct qed_slowpath_params *params); + + int (*slowpath_stop)(struct qed_dev *cdev); /* Requests to use `cnt' interrupts for fastpath. * upon success, returns number of interrupts allocated for fastpath. */ - int (*set_fp_int)(struct qed_dev *cdev, - u16 cnt); + int (*set_fp_int)(struct qed_dev *cdev, u16 cnt); /* Fills `info' with pointers required for utilizing interrupts */ - int (*get_fp_int)(struct qed_dev *cdev, - struct qed_int_info *info); - - u32 (*sb_init)(struct qed_dev *cdev, - struct qed_sb_info *sb_info, - void *sb_virt_addr, - dma_addr_t sb_phy_addr, - u16 sb_id, - enum qed_sb_type type); - - u32 (*sb_release)(struct qed_dev *cdev, - struct qed_sb_info *sb_info, - u16 sb_id, - enum qed_sb_type type); - - void (*simd_handler_config)(struct qed_dev *cdev, - void *token, - int index, - void (*handler)(void *)); - - void (*simd_handler_clean)(struct qed_dev *cdev, - int index); - int (*dbg_grc)(struct qed_dev *cdev, - void *buffer, u32 *num_dumped_bytes); + int (*get_fp_int)(struct qed_dev *cdev, struct qed_int_info *info); + + u32 (*sb_init)(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + void *sb_virt_addr, + dma_addr_t sb_phy_addr, + u16 sb_id, + enum qed_sb_type type); + + u32 (*sb_release)(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + u16 sb_id, + enum qed_sb_type type); + + void (*simd_handler_config)(struct qed_dev *cdev, + void *token, + int index, + void (*handler)(void *)); + + void (*simd_handler_clean)(struct qed_dev *cdev, int index); + + int (*dbg_grc)(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes); int (*dbg_grc_size)(struct qed_dev *cdev); - int (*dbg_all_data) (struct qed_dev *cdev, void *buffer); + int (*dbg_all_data)(struct qed_dev *cdev, void *buffer); - int (*dbg_all_data_size) (struct qed_dev *cdev); + int (*dbg_all_data_size)(struct qed_dev *cdev); + + int (*report_fatal_error)(struct devlink *devlink, + enum qed_hw_err_type err_type); /** * @brief can_link_change - can the instance change the link or not @@ -1137,6 +1143,10 @@ struct qed_common_ops { * */ int (*set_grc_config)(struct qed_dev *cdev, u32 cfg_id, u32 val); + + struct devlink* (*devlink_register)(struct qed_dev *cdev); + + void (*devlink_unregister)(struct devlink *devlink); }; #define MASK_FIELD(_name, _value) \ diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index d9015aac78c6..aaaac8ac927c 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -82,7 +82,14 @@ static inline void rcu_read_unlock_trace(void) void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); void synchronize_rcu_tasks_trace(void); void rcu_barrier_tasks_trace(void); - +#else +/* + * The BPF JIT forms these addresses even when it doesn't call these + * functions, so provide definitions that result in runtime errors. + */ +static inline void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) { BUG(); } +static inline void rcu_read_lock_trace(void) { BUG(); } +static inline void rcu_read_unlock_trace(void) { BUG(); } #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ #endif /* __LINUX_RCUPDATE_TRACE_H */ diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 1e9ed840b9fc..3119928fc103 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -340,23 +340,6 @@ static inline void sk_psock_update_proto(struct sock *sk, struct sk_psock *psock, struct proto *ops) { - /* Initialize saved callbacks and original proto only once, since this - * function may be called multiple times for a psock, e.g. when - * psock->progs.msg_parser is updated. - * - * Since we've not installed the new proto, psock is not yet in use and - * we can initialize it without synchronization. - */ - if (!psock->sk_proto) { - struct proto *orig = READ_ONCE(sk->sk_prot); - - psock->saved_unhash = orig->unhash; - psock->saved_close = orig->close; - psock->saved_write_space = sk->sk_write_space; - - psock->sk_proto = orig; - } - /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, ops); } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 14b62d7df942..56ff2952edaf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -92,6 +92,8 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ + u8 saw_unknown:1, /* Received unknown option */ + unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -237,14 +239,13 @@ struct tcp_sock { repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; - u8 syn_data:1, /* SYN includes data */ + u8 save_syn:2, /* Save headers of SYN packet */ + syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ @@ -391,6 +392,9 @@ struct tcp_sock { #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif +#if IS_ENABLED(CONFIG_SMC) + bool syn_smc; /* SYN includes SMC */ +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ @@ -406,7 +410,7 @@ struct tcp_sock { * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; - u32 *saved_syn; + struct saved_syn *saved_syn; }; enum tsq_enum { @@ -484,6 +488,12 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } +static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) +{ + return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb); diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 5036c94c0503..119f4c9c3a9c 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -3,13 +3,27 @@ #ifndef _BPF_SK_STORAGE_H #define _BPF_SK_STORAGE_H +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> +#include <linux/bpf_local_storage.h> + struct sock; void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto sk_storage_get_btf_proto; +extern const struct bpf_func_proto sk_storage_delete_btf_proto; +struct bpf_local_storage_elem; struct bpf_sk_storage_diag; struct sk_buff; struct nlattr; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d9e6b9fbd95b..c9bce9bba511 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -678,7 +678,10 @@ struct cfg80211_bitrate_mask { u32 legacy; u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; + u16 he_mcs[NL80211_HE_NSS_MAX]; enum nl80211_txrate_gi gi; + enum nl80211_he_gi he_gi; + enum nl80211_he_ltf he_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/net/dst.h b/include/net/dst.h index 6ae2e625050d..8ea8812b0b41 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -214,7 +214,7 @@ dst_allfrag(const struct dst_entry *dst) static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { - return dst_metric(dst, RTAX_LOCK) & (1<<metric); + return dst_metric(dst, RTAX_LOCK) & (1 << metric); } static inline void dst_hold(struct dst_entry *dst) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index aa8893c68c50..c738abeb3265 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -86,6 +86,8 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_rto_min; + __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a3702d1d4875..89163ef8cf4b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -296,13 +296,6 @@ static inline void __inet_sk_copy_descendant(struct sock *sk_to, memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, sk_from->sk_prot->obj_size - ancestor_size); } -#if !(IS_ENABLED(CONFIG_IPV6)) -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); -} -#endif int inet_sk_rebuild_header(struct sock *sk); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index d7a7f7c81e7b..8fce558b5fea 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -63,6 +63,9 @@ struct ipv6_stub { int encap_type); #endif struct neigh_table *nd_tbl; + + int (*ipv6_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)); }; extern const struct ipv6_stub *ipv6_stub __read_mostly; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 66e2bfd165e8..ec148b3e9c41 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3736,7 +3736,7 @@ enum ieee80211_reconfig_type { * decremented, and when they reach 1 the driver must call * ieee80211_csa_finish(). Drivers which use ieee80211_beacon_get() * get the csa counter decremented by mac80211, but must check if it is - * 1 using ieee80211_csa_is_complete() after the beacon has been + * 1 using ieee80211_beacon_counter_is_complete() after the beacon has been * transmitted and then call ieee80211_csa_finish(). * If the CSA count starts as zero or 1, this function will not be called, * since there won't be any time to beacon before the switch anyway. @@ -4763,21 +4763,21 @@ void ieee80211_tx_status_8023(struct ieee80211_hw *hw, */ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); -#define IEEE80211_MAX_CSA_COUNTERS_NUM 2 +#define IEEE80211_MAX_CNTDWN_COUNTERS_NUM 2 /** * struct ieee80211_mutable_offsets - mutable beacon offsets * @tim_offset: position of TIM element * @tim_length: size of TIM element - * @csa_counter_offs: array of IEEE80211_MAX_CSA_COUNTERS_NUM offsets - * to CSA counters. This array can contain zero values which + * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets + * to countdown counters. This array can contain zero values which * should be ignored. */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; - u16 csa_counter_offs[IEEE80211_MAX_CSA_COUNTERS_NUM]; + u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; }; /** @@ -4846,31 +4846,31 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, } /** - * ieee80211_csa_update_counter - request mac80211 to decrement the csa counter + * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * - * The csa counter should be updated after each beacon transmission. + * The beacon counter should be updated after each beacon transmission. * This function is called implicitly when * ieee80211_beacon_get/ieee80211_beacon_get_tim are called, however if the * beacon frames are generated by the device, the driver should call this - * function after each beacon transmission to sync mac80211's csa counters. + * function after each beacon transmission to sync mac80211's beacon countdown. * - * Return: new csa counter value + * Return: new countdown value */ -u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif); +u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif); /** - * ieee80211_csa_set_counter - request mac80211 to set csa counter + * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @counter: the new value for the counter * - * The csa counter can be changed by the device, this API should be + * The beacon countdown can be changed by the device, this API should be * used by the device driver to update csa counter in mac80211. * - * It should never be used together with ieee80211_csa_update_counter(), + * It should never be used together with ieee80211_beacon_update_cntdwn(), * as it will cause a race condition around the counter value. */ -void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter); +void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter); /** * ieee80211_csa_finish - notify mac80211 about channel switch @@ -4883,13 +4883,12 @@ void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter); void ieee80211_csa_finish(struct ieee80211_vif *vif); /** - * ieee80211_csa_is_complete - find out if counters reached 1 + * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1 * @vif: &struct ieee80211_vif pointer from the add_interface callback. * - * This function returns whether the channel switch counters reached zero. + * This function returns whether the countdown reached zero. */ -bool ieee80211_csa_is_complete(struct ieee80211_vif *vif); - +bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif); /** * ieee80211_proberesp_get - retrieve a Probe Response template diff --git a/include/net/netlink.h b/include/net/netlink.h index c0411f14fb53..fdd317f8fde4 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -181,8 +181,6 @@ enum { NLA_S64, NLA_BITFIELD32, NLA_REJECT, - NLA_EXACT_LEN, - NLA_MIN_LEN, __NLA_TYPE_MAX, }; @@ -199,11 +197,11 @@ struct netlink_range_validation_signed { enum nla_policy_validation { NLA_VALIDATE_NONE, NLA_VALIDATE_RANGE, + NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, - NLA_VALIDATE_WARN_TOO_LONG, }; /** @@ -222,7 +220,7 @@ enum nla_policy_validation { * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload - * NLA_MIN_LEN Minimum length of attribute payload + * (but see also below with the validation type) * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if @@ -237,11 +235,6 @@ enum nla_policy_validation { * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused - * NLA_EXACT_LEN Attribute should have exactly this length, otherwise - * it is rejected or warned about, the latter happening - * if and only if the `validation_type' is set to - * NLA_VALIDATE_WARN_TOO_LONG. - * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * * Meaning of validation union: @@ -296,6 +289,11 @@ enum nla_policy_validation { * pointer to a struct netlink_range_validation_signed * that indicates the min/max values. * Use NLA_POLICY_FULL_RANGE_SIGNED(). + * + * NLA_BINARY If the validation type is like the ones for integers + * above, then the min/max length (not value like for + * integers) of the attribute is enforced. + * * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: @@ -309,7 +307,7 @@ enum nla_policy_validation { * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, - * [ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) }, + * [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)), * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ @@ -335,9 +333,10 @@ struct nla_policy { * nesting validation starts here. * * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT - * for any types >= this, so need to use NLA_MIN_LEN to get the - * previous pure { .len = xyz } behaviour. The advantage of this - * is that types not specified in the policy will be rejected. + * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to + * get the previous pure { .len = xyz } behaviour. The advantage + * of this is that types not specified in the policy will be + * rejected. * * For completely new families it should be set to 1 so that the * validation is enforced for all attributes. For existing ones @@ -349,12 +348,6 @@ struct nla_policy { }; }; -#define NLA_POLICY_EXACT_LEN(_len) { .type = NLA_EXACT_LEN, .len = _len } -#define NLA_POLICY_EXACT_LEN_WARN(_len) \ - { .type = NLA_EXACT_LEN, .len = _len, \ - .validation_type = NLA_VALIDATE_WARN_TOO_LONG, } -#define NLA_POLICY_MIN_LEN(_len) { .type = NLA_MIN_LEN, .len = _len } - #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) @@ -370,19 +363,21 @@ struct nla_policy { { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) -#define NLA_ENSURE_UINT_TYPE(tp) \ +#define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_U8 || tp == NLA_U16 || \ tp == NLA_U32 || tp == NLA_U64 || \ - tp == NLA_MSECS) + tp) + tp == NLA_MSECS || \ + tp == NLA_BINARY) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_S16 || \ tp == NLA_S32 || tp == NLA_S64) + tp) -#define NLA_ENSURE_INT_TYPE(tp) \ +#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_U8 || \ tp == NLA_S16 || tp == NLA_U16 || \ tp == NLA_S32 || tp == NLA_U32 || \ tp == NLA_S64 || tp == NLA_U64 || \ - tp == NLA_MSECS) + tp) + tp == NLA_MSECS || \ + tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ (__NLA_ENSURE(tp != NLA_BITFIELD32 && \ tp != NLA_REJECT && \ @@ -390,14 +385,14 @@ struct nla_policy { tp != NLA_NESTED_ARRAY) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ - .type = NLA_ENSURE_INT_TYPE(tp), \ + .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE, \ .min = _min, \ .max = _max \ } #define NLA_POLICY_FULL_RANGE(tp, _range) { \ - .type = NLA_ENSURE_UINT_TYPE(tp), \ + .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range = _range, \ } @@ -409,13 +404,13 @@ struct nla_policy { } #define NLA_POLICY_MIN(tp, _min) { \ - .type = NLA_ENSURE_INT_TYPE(tp), \ + .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MIN, \ .min = _min, \ } #define NLA_POLICY_MAX(tp, _max) { \ - .type = NLA_ENSURE_INT_TYPE(tp), \ + .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ } @@ -427,6 +422,15 @@ struct nla_policy { .len = __VA_ARGS__ + 0, \ } +#define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len) +#define NLA_POLICY_EXACT_LEN_WARN(_len) { \ + .type = NLA_BINARY, \ + .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG, \ + .min = _len, \ + .max = _len \ +} +#define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) + /** * struct nl_info - netlink source information * @nlh: Netlink message header of original request diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b2eb8b4ba697..29e41ff3ec93 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -41,6 +41,13 @@ struct request_sock_ops { int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); +struct saved_syn { + u32 mac_hdrlen; + u32 network_hdrlen; + u32 tcp_hdrlen; + u8 data[]; +}; + /* struct request_sock - mini sock to represent a connection request */ struct request_sock { @@ -60,7 +67,7 @@ struct request_sock { struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; - u32 *saved_syn; + struct saved_syn *saved_syn; u32 secid; u32 peer_secid; }; diff --git a/include/net/sock.h b/include/net/sock.h index 064637d1ddf6..7dd3051551fb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -246,7 +246,7 @@ struct sock_common { /* public: */ }; -struct bpf_sk_storage; +struct bpf_local_storage; /** * struct sock - network layer representation of sockets @@ -517,7 +517,7 @@ struct sock { void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL - struct bpf_sk_storage __rcu *sk_bpf_storage; + struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; }; @@ -1478,7 +1478,7 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) { if (!sk_has_account(sk)) return true; - return size<= sk->sk_forward_alloc || + return size <= sk->sk_forward_alloc || __sk_mem_schedule(sk, size, SK_MEM_RECV) || skb_pfmemalloc(skb); } diff --git a/include/net/tcp.h b/include/net/tcp.h index eab6c7510b5b..e85d564446c6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -394,7 +394,7 @@ void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); -void tcp_init_transfer(struct sock *sk, int bpf_op); +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, @@ -455,7 +455,8 @@ enum tcp_synack_type { struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -699,7 +700,7 @@ static inline void tcp_fast_path_check(struct sock *sk) static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; + u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); @@ -941,16 +942,6 @@ INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb)); #endif -static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_tcp_l3mdev_accept && - skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) - return true; -#endif - return false; -} - /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) { @@ -2035,7 +2026,8 @@ struct tcp_request_sock_ops { int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; @@ -2233,6 +2225,55 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#ifdef CONFIG_CGROUP_BPF +/* Copy the listen sk's HDR_OPT_CB flags to its child. + * + * During 3-Way-HandShake, the synack is usually sent from + * the listen sk with the HDR_OPT_CB flags set so that + * bpf-prog will be called to write the BPF hdr option. + * + * In fastopen, the child sk is used to send synack instead + * of the listen sk. Thus, inheriting the HDR_OPT_CB flags + * from the listen sk gives the bpf-prog a chance to write + * BPF hdr option in the synack pkt during fastopen. + * + * Both fastopen and non-fastopen child will inherit the + * HDR_OPT_CB flags to keep the bpf-prog having a consistent + * behavior when deciding to clear this cb flags (or not) + * during the PASSIVE_ESTABLISHED_CB. + * + * In the future, other cb flags could be inherited here also. + */ +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ + tcp_sk(child)->bpf_sock_ops_cb_flags = + tcp_sk(sk)->bpf_sock_ops_cb_flags & + (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ + skops->skb = skb; + skops->skb_data_end = skb->data + end_offset; +} +#else +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ +} +#endif + /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index c9d87cc40c11..1a9559c0cbdd 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -18,25 +18,19 @@ struct xsk_queue; struct xdp_buff; struct xdp_umem { - struct xsk_queue *fq; - struct xsk_queue *cq; - struct xsk_buff_pool *pool; + void *addrs; u64 size; u32 headroom; u32 chunk_size; + u32 chunks; + u32 npgs; struct user_struct *user; refcount_t users; - struct work_struct work; - struct page **pgs; - u32 npgs; - u16 queue_id; - u8 need_wakeup; u8 flags; - int id; - struct net_device *dev; bool zc; - spinlock_t xsk_tx_list_lock; - struct list_head xsk_tx_list; + struct page **pgs; + int id; + struct list_head xsk_dma_list; }; struct xsk_map { @@ -48,10 +42,11 @@ struct xsk_map { struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; - struct xsk_queue *rx; + struct xsk_queue *rx ____cacheline_aligned_in_smp; struct net_device *dev; struct xdp_umem *umem; struct list_head flush_node; + struct xsk_buff_pool *pool; u16 queue_id; bool zc; enum { @@ -59,10 +54,9 @@ struct xdp_sock { XSK_BOUND, XSK_UNBOUND, } state; - /* Protects multiple processes in the control path */ - struct mutex mutex; + struct xsk_queue *tx ____cacheline_aligned_in_smp; - struct list_head list; + struct list_head tx_list; /* Mutual exclusion of NAPI TX thread and sendmsg error paths * in the SKB destructor callback. */ @@ -77,6 +71,10 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + /* Protects multiple processes in the control path */ + struct mutex mutex; + struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ + struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; #ifdef CONFIG_XDP_SOCKETS diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index ccf848f7efa4..5b1ee8a9976d 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -11,47 +11,50 @@ #ifdef CONFIG_XDP_SOCKETS -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); -void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); -void xsk_set_rx_need_wakeup(struct xdp_umem *umem); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); +void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); +bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); +void xsk_tx_release(struct xsk_buff_pool *pool); +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, + u16 queue_id); +void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool); +bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool); -static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { - return XDP_PACKET_HEADROOM + umem->headroom; + return XDP_PACKET_HEADROOM + pool->headroom; } -static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { - return umem->chunk_size; + return pool->chunk_size; } -static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { - return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); + return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } -static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, +static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { - xp_set_rxq_info(umem->pool, rxq); + xp_set_rxq_info(pool, rxq); } -static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, +static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { - xp_dma_unmap(umem->pool, attrs); + xp_dma_unmap(pool, attrs); } -static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, - unsigned long attrs) +static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, + struct device *dev, unsigned long attrs) { - return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); + struct xdp_umem *umem = pool->umem; + + return xp_dma_map(pool, dev, attrs, umem->pgs, umem->npgs); } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) @@ -68,14 +71,14 @@ static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) return xp_get_frame_dma(xskb); } -static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { - return xp_alloc(umem->pool); + return xp_alloc(pool); } -static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { - return xp_can_alloc(umem->pool, count); + return xp_can_alloc(pool, count); } static inline void xsk_buff_free(struct xdp_buff *xdp) @@ -85,100 +88,104 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) xp_free(xskb); } -static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, + u64 addr) { - return xp_raw_get_dma(umem->pool, addr); + return xp_raw_get_dma(pool, addr); } -static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { - return xp_raw_get_data(umem->pool, addr); + return xp_raw_get_data(pool, addr); } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + if (!pool->dma_need_sync) + return; + xp_dma_sync_for_cpu(xskb); } -static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, +static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - xp_dma_sync_for_device(umem->pool, dma, size); + xp_dma_sync_for_device(pool, dma, size); } #else -static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { } -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, - struct xdp_desc *desc) +static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { return false; } -static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } -static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) +static inline struct xsk_buff_pool * +xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } -static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +static inline bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return false; } -static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return 0; } -static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return 0; } -static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return 0; } -static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, +static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { } -static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, +static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { } -static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, - unsigned long attrs) +static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, + struct device *dev, unsigned long attrs) { return 0; } @@ -193,12 +200,12 @@ static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) return 0; } -static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return NULL; } -static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; } @@ -207,21 +214,22 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } -static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, + u64 addr) { return 0; } -static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return NULL; } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } -static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, +static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 6842990e2712..0140d086dc84 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -13,6 +13,8 @@ struct xsk_buff_pool; struct xdp_rxq_info; struct xsk_queue; struct xdp_desc; +struct xdp_umem; +struct xdp_sock; struct device; struct page; @@ -26,34 +28,68 @@ struct xdp_buff_xsk { struct list_head free_list_node; }; +struct xsk_dma_map { + dma_addr_t *dma_pages; + struct device *dev; + struct net_device *netdev; + refcount_t users; + struct list_head list; /* Protected by the RTNL_LOCK */ + u32 dma_pages_cnt; + bool dma_need_sync; +}; + struct xsk_buff_pool { - struct xsk_queue *fq; + /* Members only used in the control path first. */ + struct device *dev; + struct net_device *netdev; + struct list_head xsk_tx_list; + /* Protects modifications to the xsk_tx_list */ + spinlock_t xsk_tx_list_lock; + refcount_t users; + struct xdp_umem *umem; + struct work_struct work; struct list_head free_list; + u32 heads_cnt; + u16 queue_id; + + /* Data path members as close to free_heads at the end as possible. */ + struct xsk_queue *fq ____cacheline_aligned_in_smp; + struct xsk_queue *cq; + /* For performance reasons, each buff pool has its own array of dma_pages + * even when they are identical. + */ dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; u64 chunk_mask; u64 addrs_cnt; u32 free_list_cnt; u32 dma_pages_cnt; - u32 heads_cnt; u32 free_heads_cnt; u32 headroom; u32 chunk_size; u32 frame_len; + u8 cached_need_wakeup; + bool uses_need_wakeup; bool dma_need_sync; bool unaligned; void *addrs; - struct device *dev; struct xdp_buff_xsk *free_heads[]; }; /* AF_XDP core. */ -struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, - u32 chunk_size, u32 headroom, u64 size, - bool unaligned); -void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); +struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, + struct xdp_umem *umem); +int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, + u16 queue_id, u16 flags); +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, + struct net_device *dev, u16 queue_id); void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); +void xp_get_pool(struct xsk_buff_pool *pool); +void xp_put_pool(struct xsk_buff_pool *pool); +void xp_clear_dev(struct xsk_buff_pool *pool); +void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); +void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); @@ -80,9 +116,6 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - if (!xskb->pool->dma_need_sync) - return; - xp_dma_sync_for_cpu_slow(xskb); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b6238b2209b7..8dda13880957 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, }; /* Note that tracing related programs such as @@ -345,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * @@ -2807,7 +2816,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * @@ -2823,6 +2832,9 @@ union bpf_attr { * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used @@ -2835,7 +2847,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -3395,6 +3407,175 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given 'struct path' object, which + * needs to be the kernel BTF 'path' object. The path is + * returned in the provided buffer 'buf' of size 'sz' and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3539,6 +3720,13 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), \ + FN(inode_storage_get), \ + FN(inode_storage_delete), \ + FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -3648,9 +3836,13 @@ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; -/* BPF_FUNC_sk_storage_get flags */ +/* BPF_FUNC_<kernel_obj>_storage_get flags */ enum { - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. */ @@ -4071,6 +4263,15 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + struct { + __u32 map_id; + } map; + }; + } iter; struct { __u32 netns_ino; __u32 attach_type; @@ -4158,6 +4359,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -4166,8 +4397,51 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. @@ -4223,6 +4497,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4250,6 +4581,63 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index c7d66755d212..79f9191bbb24 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -2,6 +2,8 @@ #ifndef _UAPI_LINUX_GTP_H_ #define _UAPI_LINUX_GTP_H_ +#define GTP_GENL_MCGRP_NAME "gtp" + enum gtp_genl_cmds { GTP_CMD_NEWPDP, GTP_CMD_DELPDP, diff --git a/include/uapi/linux/if_pppol2tp.h b/include/uapi/linux/if_pppol2tp.h index 060b4d1f3129..a91044328bc9 100644 --- a/include/uapi/linux/if_pppol2tp.h +++ b/include/uapi/linux/if_pppol2tp.h @@ -75,7 +75,7 @@ struct pppol2tpv3in6_addr { }; /* Socket options: - * DEBUG - bitmask of debug message categories + * DEBUG - bitmask of debug message categories (not used) * SENDSEQ - 0 => don't send packets with sequence numbers * 1 => send packets with sequence numbers * RECVSEQ - 0 => receive packet sequence numbers are optional diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 5ba122c1949a..20ee93f0f876 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -160,6 +160,7 @@ enum { INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, INET_DIAG_CGROUP_ID, + INET_DIAG_SOCKOPT, __INET_DIAG_MAX, }; @@ -183,6 +184,23 @@ struct inet_diag_meminfo { __u32 idiag_tmem; }; +/* INET_DIAG_SOCKOPT */ + +struct inet_diag_sockopt { + __u8 recverr:1, + is_icsk:1, + freebind:1, + hdrincl:1, + mc_loop:1, + transparent:1, + mc_all:1, + nodefrag:1; + __u8 bind_address_no_port:1, + recverr_rfc4884:1, + defer_connect:1, + unused:5; +}; + /* INET_DIAG_VEGASINFO */ struct tcpvegas_info { diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 61158f5a1a5b..88a0d32b8c07 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -108,7 +108,7 @@ enum { L2TP_ATTR_VLAN_ID, /* u16 (not used) */ L2TP_ATTR_COOKIE, /* 0, 4 or 8 bytes */ L2TP_ATTR_PEER_COOKIE, /* 0, 4 or 8 bytes */ - L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags */ + L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags (not used) */ L2TP_ATTR_RECV_SEQ, /* u8 */ L2TP_ATTR_SEND_SEQ, /* u8 */ L2TP_ATTR_LNS_MODE, /* u8 */ @@ -177,7 +177,9 @@ enum l2tp_seqmode { }; /** - * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions + * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions. + * + * Unused. * * @L2TP_MSG_DEBUG: verbose debug (if compiled in) * @L2TP_MSG_CONTROL: userspace - kernel interface diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 631f3a997b3c..0584e0d349f0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -252,9 +252,13 @@ * DOC: SAE authentication offload * * By setting @NL80211_EXT_FEATURE_SAE_OFFLOAD flag drivers can indicate they - * support offloading SAE authentication for WPA3-Personal networks. In - * %NL80211_CMD_CONNECT the password for SAE should be specified using - * %NL80211_ATTR_SAE_PASSWORD. + * support offloading SAE authentication for WPA3-Personal networks in station + * mode. Similarly @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP flag can be set by + * drivers indicating the offload support in AP mode. + * + * The password for SAE should be specified using %NL80211_ATTR_SAE_PASSWORD in + * %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP for station and AP mode + * respectively. */ /** @@ -647,13 +651,9 @@ * authentication/association or not receiving a response from the AP. * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as * well to remain backwards compatible. - * When establishing a security association, drivers that support 4 way - * handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when - * the 4 way handshake is completed successfully. * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself. - * When a security association was established with the new AP (e.g. if - * the FT protocol was used for roaming or the driver completed the 4 way - * handshake), this event should be followed by an + * When a security association was established on an 802.1X network using + * fast transition, this event should be followed by an * %NL80211_CMD_PORT_AUTHORIZED event. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other @@ -1067,13 +1067,11 @@ * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by * %NL80211_ATTR_MAC. - * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way - * handshake was completed successfully by the driver. The BSSID is - * specified with %NL80211_ATTR_MAC. Drivers that support 4 way handshake - * offload should send this event after indicating 802.11 association with - * %NL80211_CMD_CONNECT or %NL80211_CMD_ROAM. If the 4 way handshake failed - * %NL80211_CMD_DISCONNECT should be indicated instead. - * + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates an 802.1X FT roam was + * completed successfully. Drivers that support 4 way handshake offload + * should send this event after indicating 802.1X FT assocation with + * %NL80211_CMD_ROAM. If the 4 way handshake failed %NL80211_CMD_DISCONNECT + * should be indicated instead. * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request * and RX notification. This command is used both as a request to transmit * a control port frame and as a notification that a control port frame @@ -2082,10 +2080,10 @@ enum nl80211_commands { * operation). * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information * for the time while performing a channel switch. - * @NL80211_ATTR_CSA_C_OFF_BEACON: An array of offsets (u16) to the channel - * switch counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). - * @NL80211_ATTR_CSA_C_OFF_PRESP: An array of offsets (u16) to the channel - * switch counters in the probe response (%NL80211_ATTR_PROBE_RESP). + * @NL80211_ATTR_CNTDWN_OFFS_BEACON: An array of offsets (u16) to the channel + * switch or color change counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). + * @NL80211_ATTR_CNTDWN_OFFS_PRESP: An array of offsets (u16) to the channel + * switch or color change counters in the probe response (%NL80211_ATTR_PROBE_RESP). * * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32. * As specified in the &enum nl80211_rxmgmt_flags. @@ -2821,8 +2819,8 @@ enum nl80211_attrs { NL80211_ATTR_CH_SWITCH_COUNT, NL80211_ATTR_CH_SWITCH_BLOCK_TX, NL80211_ATTR_CSA_IES, - NL80211_ATTR_CSA_C_OFF_BEACON, - NL80211_ATTR_CSA_C_OFF_PRESP, + NL80211_ATTR_CNTDWN_OFFS_BEACON, + NL80211_ATTR_CNTDWN_OFFS_PRESP, NL80211_ATTR_RXMGMT_FLAGS, @@ -3009,6 +3007,8 @@ enum nl80211_attrs { #define NL80211_ATTR_MESH_PARAMS NL80211_ATTR_MESH_CONFIG #define NL80211_ATTR_IFACE_SOCKET_OWNER NL80211_ATTR_SOCKET_OWNER #define NL80211_ATTR_SAE_DATA NL80211_ATTR_AUTH_DATA +#define NL80211_ATTR_CSA_C_OFF_BEACON NL80211_ATTR_CNTDWN_OFFS_BEACON +#define NL80211_ATTR_CSA_C_OFF_PRESP NL80211_ATTR_CNTDWN_OFFS_PRESP /* * Allow user space programs to use #ifdef on new attributes by defining them @@ -3187,6 +3187,18 @@ enum nl80211_he_gi { }; /** + * enum nl80211_he_ltf - HE long training field + * @NL80211_RATE_INFO_HE_1xLTF: 3.2 usec + * @NL80211_RATE_INFO_HE_2xLTF: 6.4 usec + * @NL80211_RATE_INFO_HE_4xLTF: 12.8 usec + */ +enum nl80211_he_ltf { + NL80211_RATE_INFO_HE_1XLTF, + NL80211_RATE_INFO_HE_2XLTF, + NL80211_RATE_INFO_HE_4XLTF, +}; + +/** * enum nl80211_he_ru_alloc - HE RU allocation values * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation * @NL80211_RATE_INFO_HE_RU_ALLOC_52: 52-tone RU allocation @@ -4741,6 +4753,10 @@ enum nl80211_key_attributes { * @NL80211_TXRATE_VHT: VHT rates allowed for TX rate selection, * see &struct nl80211_txrate_vht * @NL80211_TXRATE_GI: configure GI, see &enum nl80211_txrate_gi + * @NL80211_TXRATE_HE: HE rates allowed for TX rate selection, + * see &struct nl80211_txrate_he + * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. + * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -4750,6 +4766,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HT, NL80211_TXRATE_VHT, NL80211_TXRATE_GI, + NL80211_TXRATE_HE, + NL80211_TXRATE_HE_GI, + NL80211_TXRATE_HE_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -4767,6 +4786,15 @@ struct nl80211_txrate_vht { __u16 mcs[NL80211_VHT_NSS_MAX]; }; +#define NL80211_HE_NSS_MAX 8 +/** + * struct nl80211_txrate_he - HE MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) + */ +struct nl80211_txrate_he { + __u16 mcs[NL80211_HE_NSS_MAX]; +}; + enum nl80211_txrate_gi { NL80211_TXRATE_DEFAULT_GI, NL80211_TXRATE_FORCE_SGI, @@ -5821,6 +5849,9 @@ enum nl80211_feature_flags { * handshake with PSK in AP mode (PSK is passed as part of the start AP * command). * + * @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP: Device wants to do SAE authentication + * in AP mode (SAE password is passed as part of the start AP command). + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5878,6 +5909,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, |