Diffstat (limited to 'net/ipv4')
42 files changed, 2405 insertions, 887 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e64e59b536d3..60db5a6487cc 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -10,7 +10,7 @@ config IP_MULTICAST
	  intend to participate in the MBONE, a high bandwidth network on top
	  of the Internet which carries audio and video broadcasts. More
	  information about the MBONE is on the WWW at
-	  <http://www.savetz.com/mbone/>. For most people, it's safe to say N.
+	  <https://www.savetz.com/mbone/>. For most people, it's safe to say N.
 
 config IP_ADVANCED_ROUTER
	bool "IP: advanced router"
@@ -73,7 +73,7 @@ config IP_MULTIPLE_TABLES
	  If you need more information, see the Linux Advanced
	  Routing and Traffic Control documentation at
-	  <http://lartc.org/howto/lartc.rpdb.html>
+	  <https://lartc.org/howto/lartc.rpdb.html>
 
	  If unsure, say N.
 
@@ -280,7 +280,7 @@ config SYN_COOKIES
	  continue to connect, even when your machine is under attack. There
	  is no need for the legitimate users to change their TCP/IP software;
	  SYN cookies work transparently to them. For technical information
-	  about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+	  about SYN cookies, check out <https://cr.yp.to/syncookies.html>.
 
	  If you are SYN flooded, the source address reported by the kernel is
	  likely to have been forged by the attacker; it is only reported as
@@ -525,7 +525,7 @@ config TCP_CONG_HSTCP
	  A modification to TCP's congestion control mechanism for use with
	  large congestion windows. A table indicates how much to increase the
	  congestion window by when an ACK is received.
-	  For more detail see http://www.icir.org/floyd/hstcp.html
+	  For more detail see https://www.icir.org/floyd/hstcp.html
 
 config TCP_CONG_HYBLA
	tristate "TCP-Hybla congestion control algorithm"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9e1a186a3671..5b77a46885b9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \
	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
-	     metrics.o netlink.o nexthop.o
+	     metrics.o netlink.o nexthop.o udp_tunnel_stub.o
 
 obj-$(CONFIG_BPFILTER) += bpfilter/
 
@@ -29,6 +29,7 @@ gre-y := gre_demux.o
 obj-$(CONFIG_NET_FOU) += fou.o
 obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+udp_tunnel-y := udp_tunnel_core.o udp_tunnel_nic.o
 obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
 obj-$(CONFIG_NET_IPVTI) += ip_vti.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 02aa5cb3a4fd..4307503a6f0b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -411,6 +411,9 @@ int inet_release(struct socket *sock)
	if (sk) {
		long timeout;
 
+		if (!sk->sk_kern_sock)
+			BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk);
+
		/* Applications forget to leave groups before exiting */
		ip_mc_drop_socket(sk);
 
@@ -1040,8 +1043,6 @@ const struct proto_ops inet_stream_ops = {
	.sendpage_locked   = tcp_sendpage_locked,
	.peek_len	   = tcp_peek_len,
 #ifdef CONFIG_COMPAT
-	.compat_setsockopt = compat_sock_common_setsockopt,
-	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
 #endif
	.set_rcvlowat	   = tcp_set_rcvlowat,
@@ -1070,8 +1071,6 @@ const struct proto_ops inet_dgram_ops = {
	.sendpage	   = inet_sendpage,
	.set_peek_off	   = sk_set_peek_off,
 #ifdef CONFIG_COMPAT
-	.compat_setsockopt = compat_sock_common_setsockopt,
-	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
 #endif
 };
@@ -1102,8 +1101,6 @@ static const struct proto_ops inet_sockraw_ops = {
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
-	.compat_setsockopt = compat_sock_common_setsockopt,
-	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
 #endif
 };
@@ -1432,10 +1429,6 @@ static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
	return inet_gso_segment(skb, features);
 }
 
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *,
-							   struct sk_buff *));
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
-							   struct sk_buff *));
 struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
	const struct net_offload *ops;
@@ -1608,8 +1601,6 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
	return -EINVAL;
 }
 
-INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int));
-INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
 int inet_gro_complete(struct sk_buff *skb, int nhoff)
 {
	__be16 newlen = htons(skb->len - nhoff);
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 9063c6767d34..545b2640f019 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -21,8 +21,7 @@ void bpfilter_umh_cleanup(struct umd_info *info)
 }
 EXPORT_SYMBOL_GPL(bpfilter_umh_cleanup);
 
-static int bpfilter_mbox_request(struct sock *sk, int optname,
-				 char __user *optval,
+static int bpfilter_mbox_request(struct sock *sk, int optname, sockptr_t optval,
				 unsigned int optlen, bool is_set)
 {
	int err;
@@ -52,20 +51,23 @@ out:
	return err;
 }
 
-int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval,
			    unsigned int optlen)
 {
	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
 }
 
-int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
-			    int __user *optlen)
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname,
+			    char __user *user_optval, int __user *optlen)
 {
-	int len;
+	sockptr_t optval;
+	int err, len;
 
	if (get_user(len, optlen))
		return -EFAULT;
-
+	err = init_user_sockptr(&optval, user_optval, len);
+	if (err)
+		return err;
	return bpfilter_mbox_request(sk, optname, optval, len, false);
 }
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index a23094b050f8..2eb71579f4d2 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -10,9 +10,9 @@
  *
  * The CIPSO draft specification can be found in the kernel's Documentation
  * directory as well as the following URL:
- *   http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ *   https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
  * The FIPS-188 specification can be found at the following URL:
- *   http://www.itl.nist.gov/fipspubs/fip188.htm
+ *   https://www.itl.nist.gov/fipspubs/fip188.htm
  *
  * Author: Paul Moore <paul.moore@hp.com>
  */
@@ -283,7 +283,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
 
 /**
  * cipso_v4_cache_add - Add an entry to the CIPSO cache
- * @skb: the packet
+ * @cipso_ptr: pointer to CIPSO IP option
  * @secattr: the packet's security attributes
  *
  * Description:
@@ -1535,6 +1535,7 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
 
 /**
  * cipso_v4_validate - Validate a CIPSO option
+ * @skb: the packet
  * @option: the start of the option, on error it is set to point to the error
  *
  * Description:
@@ -2066,7 +2067,7 @@ void cipso_v4_sock_delattr(struct sock *sk)
 
 /**
  * cipso_v4_req_delattr - Delete the CIPSO option from a request socket
- * @reg: the request socket
+ * @req: the request socket
  *
  * Description:
  * Removes the CIPSO option from a request socket, if present.
@@ -2158,6 +2159,7 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
 /**
  * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
  * @skb: the packet
+ * @doi_def: the DOI structure
  * @secattr: the security attributes
  *
  * Description:
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f99e3bac5cab..ce54a30c2ef1 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -29,6 +29,7 @@
 #include <net/ip_fib.h>
 #include <net/nexthop.h>
 #include <net/fib_rules.h>
+#include <linux/indirect_call_wrapper.h>
 
 struct fib4_rule {
	struct fib_rule common;
@@ -103,8 +104,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp,
 }
 EXPORT_SYMBOL_GPL(__fib_lookup);
 
-static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
-			    int flags, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule,
+					     struct flowi *flp, int flags,
+					     struct fib_lookup_arg *arg)
 {
	int err = -EAGAIN;
	struct fib_table *tbl;
@@ -138,7 +140,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
	return err;
 }
 
-static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule,
+						struct fib_lookup_arg *arg)
 {
	struct fib_result *result = (struct fib_result *) arg->result;
	struct net_device *dev = NULL;
@@ -169,7 +172,8 @@ suppress_route:
	return true;
 }
 
-static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
+					    struct flowi *fl, int flags)
 {
	struct fib4_rule *r = (struct fib4_rule *) rule;
	struct flowi4 *fl4 = &fl->u.ip4;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3c65f71d0e82..c89b46fec153 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -13,7 +13,7 @@
  *
  * An experimental study of compression methods for dynamic tries
  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- * http://www.csc.kth.se/~snilsson/software/dyntrie2/
+ * https://www.csc.kth.se/~snilsson/software/dyntrie2/
  *
  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 2e6d1b7a7bc9..e0a246575887 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -15,12 +15,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
				       netdev_features_t features)
 {
	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+	bool need_csum, need_recompute_csum, gso_partial;
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	u16 mac_offset = skb->mac_header;
	__be16 protocol = skb->protocol;
	u16 mac_len = skb->mac_len;
	int gre_offset, outer_hlen;
-	bool need_csum, gso_partial;
 
	if (!skb->encapsulation)
		goto out;
@@ -41,6 +41,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
	skb->protocol = skb->inner_protocol;
 
	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
+	need_recompute_csum = skb->csum_not_inet;
	skb->encap_hdr_csum = need_csum;
 
	features &= skb->dev->hw_enc_features;
@@ -98,7 +99,15 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
		}
 
		*(pcsum + 1) = 0;
-		*pcsum = gso_make_checksum(skb, 0);
+		if (need_recompute_csum && !skb_is_gso(skb)) {
+			__wsum csum;
+
+			csum = skb_checksum(skb, gre_offset,
+					    skb->len - gre_offset, 0);
+			*pcsum = csum_fold(csum);
+		} else {
+			*pcsum = gso_make_checksum(skb, 0);
+		}
	} while ((skb = skb->next));
 out:
	return segs;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e30515f89802..cf36f955bfe6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1116,6 +1116,65 @@ error:
	goto drop;
 }
 
+static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off)
+{
+	struct icmp_extobj_hdr *objh, _objh;
+	struct icmp_ext_hdr *exth, _exth;
+	u16 olen;
+
+	exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth);
+	if (!exth)
+		return false;
+	if (exth->version != 2)
+		return true;
+
+	if (exth->checksum &&
+	    csum_fold(skb_checksum(skb, off, skb->len - off, 0)))
+		return false;
+
+	off += sizeof(_exth);
+	while (off < skb->len) {
+		objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh);
+		if (!objh)
+			return false;
+
+		olen = ntohs(objh->length);
+		if (olen < sizeof(_objh))
+			return false;
+
+		off += olen;
+		if (off > skb->len)
+			return false;
+	}
+
+	return true;
+}
+
+void ip_icmp_error_rfc4884(const struct sk_buff *skb,
+			   struct sock_ee_data_rfc4884 *out,
+			   int thlen, int off)
+{
+	int hlen;
+
+	/* original datagram headers: end of icmph to payload (skb->data) */
+	hlen = -skb_transport_offset(skb) - thlen;
+
+	/* per rfc 4884: minimal datagram length of 128 bytes */
+	if (off < 128 || off < hlen)
+		return;
+
+	/* kernel has stripped headers: return payload offset in bytes */
+	off -= hlen;
+	if (off + sizeof(struct icmp_ext_hdr) > skb->len)
+		return;
+
+	out->len = off;
+
+	if (!ip_icmp_error_rfc4884_validate(skb, off))
+		out->flags |= SO_EE_RFC4884_FLAG_INVALID;
+}
+EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884);
+
 int icmp_err(struct sk_buff *skb, u32 info)
 {
	struct iphdr *iph = (struct iphdr *)skb->data;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index afaf582a5aa9..d1a3913eebe0 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -648,20 +648,19 @@ no_route:
 EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
 
 /* Decide when to expire the request and when to resend SYN-ACK */
-static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
-				  const int max_retries,
-				  const u8 rskq_defer_accept,
-				  int *expire, int *resend)
+static void syn_ack_recalc(struct request_sock *req,
+			   const int max_syn_ack_retries,
+			   const u8 rskq_defer_accept,
+			   int *expire, int *resend)
 {
	if (!rskq_defer_accept) {
-		*expire = req->num_timeout >= thresh;
+		*expire = req->num_timeout >= max_syn_ack_retries;
		*resend = 1;
		return;
	}
-	*expire = req->num_timeout >= thresh &&
-		  (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
-	/*
-	 * Do not resend while waiting for data after ACK,
+	*expire = req->num_timeout >= max_syn_ack_retries &&
+		  (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept);
+	/* Do not resend while waiting for data after ACK,
	 * start to resend on end of deferring period to give
	 * last chance for data or ACK to create established socket.
	 */
@@ -720,15 +719,12 @@ static void reqsk_timer_handler(struct timer_list *t)
	struct net *net = sock_net(sk_listener);
	struct inet_connection_sock *icsk = inet_csk(sk_listener);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
-	int qlen, expire = 0, resend = 0;
-	int max_retries, thresh;
-	u8 defer_accept;
+	int max_syn_ack_retries, qlen, expire = 0, resend = 0;
 
	if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
		goto drop;
 
-	max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
-	thresh = max_retries;
+	max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to established socket) for first timeout.
	 * If synack was not acknowledged for 1 second, it means
@@ -750,17 +746,14 @@ static void reqsk_timer_handler(struct timer_list *t)
	if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
		int young = reqsk_queue_len_young(queue) << 1;
 
-		while (thresh > 2) {
+		while (max_syn_ack_retries > 2) {
			if (qlen < young)
				break;
-			thresh--;
+			max_syn_ack_retries--;
			young <<= 1;
		}
	}
-	defer_accept = READ_ONCE(queue->rskq_defer_accept);
-	if (defer_accept)
-		max_retries = defer_accept;
-	syn_ack_recalc(req, thresh, max_retries, defer_accept,
+	syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
		       &expire, &resend);
	req->rsk_ops->syn_ack_timeout(req);
	if (!expire &&
@@ -1064,34 +1057,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 }
 EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
 
-#ifdef CONFIG_COMPAT
-int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
-			       char __user *optval, int __user *optlen)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-
-	if (icsk->icsk_af_ops->compat_getsockopt)
-		return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
-							    optval, optlen);
-	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
-					     optval, optlen);
-}
-EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
-
-int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
-			       char __user *optval, unsigned int optlen)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-
-	if (icsk->icsk_af_ops->compat_setsockopt)
-		return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
-							    optval, optlen);
-	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
-					     optval, optlen);
-}
-EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
-#endif
-
 static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
 {
	const struct inet_sock *inet = inet_sk(sk);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 125f4f8a36b4..4a98dd736270 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -52,6 +52,11 @@ static DEFINE_MUTEX(inet_diag_table_mutex);
 static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
 {
+	if (proto < 0 || proto >= IPPROTO_MAX) {
+		mutex_lock(&inet_diag_table_mutex);
+		return ERR_PTR(-ENOENT);
+	}
+
	if (!inet_diag_table[proto])
		sock_load_diag_module(AF_INET, proto);
 
@@ -181,6 +186,28 @@ errout:
 }
 EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
 
+static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen,
+				  struct nlattr **req_nlas)
+{
+	struct nlattr *nla;
+	int remaining;
+
+	nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) {
+		int type = nla_type(nla);
+
+		if (type < __INET_DIAG_REQ_MAX)
+			req_nlas[type] = nla;
+	}
+}
+
+static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req,
+				  const struct inet_diag_dump_data *data)
+{
+	if (data->req_nlas[INET_DIAG_REQ_PROTOCOL])
+		return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]);
+	return req->sdiag_protocol;
+}
+
 #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
 
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
@@ -198,7 +225,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
	void *info = NULL;
 
	cb_data = cb->data;
-	handler = inet_diag_table[req->sdiag_protocol];
+	handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)];
	BUG_ON(!handler);
 
	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
@@ -539,20 +566,25 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
 
 static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
			       const struct nlmsghdr *nlh,
+			       int hdrlen,
			       const struct inet_diag_req_v2 *req)
 {
	const struct inet_diag_handler *handler;
-	int err;
+	struct inet_diag_dump_data dump_data;
+	int err, protocol;
 
-	handler = inet_diag_lock_handler(req->sdiag_protocol);
+	memset(&dump_data, 0, sizeof(dump_data));
+	inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas);
+	protocol = inet_diag_get_protocol(req, &dump_data);
+
+	handler = inet_diag_lock_handler(protocol);
	if (IS_ERR(handler)) {
		err = PTR_ERR(handler);
	} else if (cmd == SOCK_DIAG_BY_FAMILY) {
-		struct inet_diag_dump_data empty_dump_data = {};
		struct netlink_callback cb = {
			.nlh = nlh,
			.skb = in_skb,
-			.data = &empty_dump_data,
+			.data = &dump_data,
		};
		err = handler->dump_one(&cb, req);
	} else if (cmd == SOCK_DESTROY && handler->destroy) {
@@ -1103,13 +1135,16 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
 static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
			    const struct inet_diag_req_v2 *r)
 {
+	struct inet_diag_dump_data *cb_data = cb->data;
	const struct inet_diag_handler *handler;
	u32 prev_min_dump_alloc;
-	int err = 0;
+	int protocol, err = 0;
+
+	protocol = inet_diag_get_protocol(r, cb_data);
 
 again:
	prev_min_dump_alloc = cb->min_dump_alloc;
-	handler = inet_diag_lock_handler(r->sdiag_protocol);
+	handler = inet_diag_lock_handler(protocol);
	if (!IS_ERR(handler))
		handler->dump(skb, cb, r);
	else
@@ -1139,19 +1174,13 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
	struct inet_diag_dump_data *cb_data;
	struct sk_buff *skb = cb->skb;
	struct nlattr *nla;
-	int rem, err;
+	int err;
 
	cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL);
	if (!cb_data)
		return -ENOMEM;
 
-	nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen),
-			  nlmsg_attrlen(nlh, hdrlen), rem) {
-		int type = nla_type(nla);
-
-		if (type < __INET_DIAG_REQ_MAX)
-			cb_data->req_nlas[type] = nla;
-	}
+	inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas);
 
	nla = cb_data->inet_diag_nla_bc;
	if (nla) {
@@ -1237,7 +1266,8 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
	req.idiag_states = rc->idiag_states;
	req.id = rc->id;
 
-	return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req);
+	return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh,
+				   sizeof(struct inet_diag_req), &req);
 }
 
 static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -1279,7 +1309,8 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
		return netlink_dump_start(net->diag_nlsk, skb, h, &c);
	}
 
-	return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));
+	return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen,
+				   nlmsg_data(h));
 }
 
 static
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2bbaaf0c7176..4eb4cd8d20dd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -246,6 +246,21 @@ static inline int compute_score(struct sock *sk, struct net *net,
	return score;
 }
 
+static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
+					    struct sk_buff *skb, int doff,
+					    __be32 saddr, __be16 sport,
+					    __be32 daddr, unsigned short hnum)
+{
+	struct sock *reuse_sk = NULL;
+	u32 phash;
+
+	if (sk->sk_reuseport) {
+		phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
+	}
+	return reuse_sk;
+}
+
 /*
  * Here are some nice properties to exploit here. The BSD API
  * does not allow a listening sock to specify the remote port nor the
@@ -265,21 +280,17 @@ static struct sock *inet_lhash2_lookup(struct net *net,
	struct inet_connection_sock *icsk;
	struct sock *sk, *result = NULL;
	int score, hiscore = 0;
-	u32 phash = 0;
 
	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
		sk = (struct sock *)icsk;
		score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
		if (score > hiscore) {
-			if (sk->sk_reuseport) {
-				phash = inet_ehashfn(net, daddr, hnum,
-						     saddr, sport);
-				result = reuseport_select_sock(sk, phash,
-							       skb, doff);
-				if (result)
-					return result;
-			}
+			result = lookup_reuseport(net, sk, skb, doff,
+						  saddr, sport, daddr, hnum);
+			if (result)
+				return result;
+
			result = sk;
			hiscore = score;
		}
@@ -288,6 +299,29 @@ static struct sock *inet_lhash2_lookup(struct net *net,
	return result;
 }
 
+static inline struct sock *inet_lookup_run_bpf(struct net *net,
+					       struct inet_hashinfo *hashinfo,
+					       struct sk_buff *skb, int doff,
+					       __be32 saddr, __be16 sport,
+					       __be32 daddr, u16 hnum)
+{
+	struct sock *sk, *reuse_sk;
+	bool no_reuseport;
+
+	if (hashinfo != &tcp_hashinfo)
+		return NULL; /* only TCP is supported */
+
+	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP,
+					    saddr, sport, daddr, hnum, &sk);
+	if (no_reuseport || IS_ERR_OR_NULL(sk))
+		return sk;
+
+	reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
+	if (reuse_sk)
+		sk = reuse_sk;
+	return sk;
+}
+
 struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
@@ -299,6 +333,14 @@ struct sock *__inet_lookup_listener(struct net *net,
	struct sock *result = NULL;
	unsigned int hash2;
 
+	/* Lookup redirect from BPF */
+	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+		result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
+					     saddr, sport, daddr, hnum);
+		if (result)
+			goto done;
+	}
+
	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ddaa01ec2bce..948747aac4e2 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -519,15 +519,20 @@ void ip_options_undo(struct ip_options *opt)
	}
 }
 
-static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
+		   sockptr_t data, int optlen)
 {
-	return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
+	struct ip_options_rcu *opt;
+
+	opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
		       GFP_KERNEL);
-}
+	if (!opt)
+		return -ENOMEM;
+	if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) {
+		kfree(opt);
+		return -EFAULT;
+	}
 
-static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
-				 struct ip_options_rcu *opt, int optlen)
-{
	while (optlen & 3)
		opt->opt.__data[optlen++] = IPOPT_END;
	opt->opt.optlen = optlen;
@@ -540,32 +545,6 @@ static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
	return 0;
 }
 
-int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
-			     unsigned char __user *data, int optlen)
-{
-	struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
-
-	if (!opt)
-		return -ENOMEM;
-	if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
-		kfree(opt);
-		return -EFAULT;
-	}
-	return ip_options_get_finish(net, optp, opt, optlen);
-}
-
-int ip_options_get(struct net *net, struct ip_options_rcu **optp,
-		   unsigned char *data, int optlen)
-{
-	struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
-
-	if (!opt)
-		return -ENOMEM;
-	if (optlen)
-		memcpy(opt->opt.__data, data, optlen);
-	return ip_options_get_finish(net, optp, opt, optlen);
-}
-
 void ip_forward_options(struct sk_buff *skb)
 {
	struct ip_options *opt = &(IPCB(skb)->opt);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 17206677d503..61f802d5350c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -539,6 +539,12 @@ no_route:
 }
 EXPORT_SYMBOL(__ip_queue_xmit);
 
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{
+	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+}
+EXPORT_SYMBOL(ip_queue_xmit);
+
 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 {
	to->pkt_type = from->pkt_type;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 84ec3703c909..d2c223554ff7 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -280,7 +280,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
			err = cmsg->cmsg_len - sizeof(struct cmsghdr);
 
			/* Our caller is responsible for freeing ipc->opt */
-			err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
+			err = ip_options_get(net, &ipc->opt,
+					     KERNEL_SOCKPTR(CMSG_DATA(cmsg)),
					     err < 40 ? err : 40);
			if (err)
				return err;
@@ -389,6 +390,18 @@ int ip_ra_control(struct sock *sk, unsigned char on,
	return 0;
 }
 
+static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb,
+				    struct sock_ee_data_rfc4884 *out)
+{
+	switch (icmp_hdr(skb)->type) {
+	case ICMP_DEST_UNREACH:
+	case ICMP_TIME_EXCEEDED:
+	case ICMP_PARAMETERPROB:
+		ip_icmp_error_rfc4884(skb, out, sizeof(struct icmphdr),
+				      icmp_hdr(skb)->un.reserved[1] * 4);
+	}
+}
+
 void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
		   __be16 port, u32 info, u8 *payload)
 {
@@ -411,6 +424,9 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
	serr->port = port;
 
	if (skb_pull(skb, payload - skb->data)) {
+		if (inet_sk(sk)->recverr_rfc4884)
+			ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
+
		skb_reset_transport_header(skb);
		if (sock_queue_err_skb(sk, skb) == 0)
			return;
@@ -679,20 +695,48 @@ Eaddrnotavail:
	return -EADDRNOTAVAIL;
 }
 
+static int copy_group_source_from_sockptr(struct group_source_req *greqs,
+					  sockptr_t optval, int optlen)
+{
+	if (in_compat_syscall()) {
+		struct compat_group_source_req gr32;
+
+		if (optlen != sizeof(gr32))
+			return -EINVAL;
+		if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+			return -EFAULT;
+		greqs->gsr_interface = gr32.gsr_interface;
+		greqs->gsr_group = gr32.gsr_group;
+		greqs->gsr_source = gr32.gsr_source;
+	} else {
+		if (optlen != sizeof(*greqs))
+			return -EINVAL;
+		if (copy_from_sockptr(greqs, optval, sizeof(*greqs)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
 static int do_mcast_group_source(struct sock *sk, int optname,
-				 struct group_source_req *greqs)
+				 sockptr_t optval, int optlen)
 {
+	struct group_source_req greqs;
	struct ip_mreq_source mreqs;
	struct sockaddr_in *psin;
	int omode, add, err;
 
-	if (greqs->gsr_group.ss_family != AF_INET ||
-	    greqs->gsr_source.ss_family != AF_INET)
+	err = copy_group_source_from_sockptr(&greqs, optval, optlen);
+	if (err)
+		return err;
+
+	if (greqs.gsr_group.ss_family != AF_INET ||
+	    greqs.gsr_source.ss_family != AF_INET)
		return -EADDRNOTAVAIL;
 
-	psin = (struct sockaddr_in *)&greqs->gsr_group;
+	psin = (struct sockaddr_in *)&greqs.gsr_group;
	mreqs.imr_multiaddr = psin->sin_addr.s_addr;
-	psin = (struct sockaddr_in *)&greqs->gsr_source;
+	psin = (struct sockaddr_in *)&greqs.gsr_source;
	mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
	mreqs.imr_interface = 0; /* use index for mc_source */
 
@@ -705,25 +749,145 @@ static int do_mcast_group_source(struct sock *sk, int optname,
	} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
		struct ip_mreqn mreq;
 
-		psin = (struct sockaddr_in *)&greqs->gsr_group;
+		psin = (struct sockaddr_in *)&greqs.gsr_group;
		mreq.imr_multiaddr = psin->sin_addr;
		mreq.imr_address.s_addr = 0;
-		mreq.imr_ifindex = greqs->gsr_interface;
+		mreq.imr_ifindex = greqs.gsr_interface;
		err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
		if (err && err != -EADDRINUSE)
			return err;
-		greqs->gsr_interface = mreq.imr_ifindex;
+		greqs.gsr_interface = mreq.imr_ifindex;
		omode = MCAST_INCLUDE;
		add = 1;
	} else /* MCAST_LEAVE_SOURCE_GROUP */ {
		omode = MCAST_INCLUDE;
		add = 0;
	}
-	return ip_mc_source(add, omode, sk, &mreqs, greqs->gsr_interface);
+	return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface);
+}
+
+static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)
+{
+	struct group_filter *gsf = NULL;
+	int err;
+
+	if (optlen < GROUP_FILTER_SIZE(0))
+		return -EINVAL;
+	if (optlen > sysctl_optmem_max)
+		return -ENOBUFS;
+
+	gsf = memdup_sockptr(optval, optlen);
+	if (IS_ERR(gsf))
+		return PTR_ERR(gsf);
+
+	/* numsrc >= (4G-140)/128 overflow in 32 bits */
+	err = -ENOBUFS;
+	if (gsf->gf_numsrc >= 0x1ffffff ||
+	    gsf->gf_numsrc > sock_net(sk)->ipv4.sysctl_igmp_max_msf)
+		goto out_free_gsf;
+
+	err = -EINVAL;
+	if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen)
+		goto out_free_gsf;
+
+	err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc,
+				 gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist);
+out_free_gsf:
+	kfree(gsf);
+	return err;
+}
+
+static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+					int optlen)
+{
+	const int size0 = offsetof(struct compat_group_filter, gf_slist);
+	struct compat_group_filter *gf32;
+	unsigned int n;
+	void *p;
+	int err;
+
+	if (optlen < size0)
+		return -EINVAL;
+	if (optlen > sysctl_optmem_max - 4)
+		return -ENOBUFS;
+
+	p = kmalloc(optlen + 4, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */
+
+	err = -EFAULT;
+	if (copy_from_sockptr(gf32, optval, optlen))
+		goto out_free_gsf;
+
+	/* numsrc >= (4G-140)/128 overflow in 32 bits */
+	n = gf32->gf_numsrc;
+	err = -ENOBUFS;
+	if (n >= 0x1ffffff)
+		goto out_free_gsf;
+
+	err = -EINVAL;
+	if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen)
+		goto out_free_gsf;
+
+	/* numsrc >= (4G-140)/128 overflow in 32 bits */
+	err = -ENOBUFS;
+	if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf)
+		goto out_free_gsf;
+	err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode,
+				 &gf32->gf_group, gf32->gf_slist);
+out_free_gsf:
+	kfree(p);
+	return err;
 }
 
-static int do_ip_setsockopt(struct sock *sk, int level,
-		int optname, char __user *optval, unsigned int optlen)
+static int ip_mcast_join_leave(struct sock *sk, int optname,
+			       sockptr_t optval, int optlen)
+{
+	struct ip_mreqn mreq = { };
+	struct sockaddr_in *psin;
+	struct group_req greq;
+
+	if (optlen < sizeof(struct group_req))
+		return -EINVAL;
+	if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+		return -EFAULT;
+
+	psin = (struct sockaddr_in *)&greq.gr_group;
+	if (psin->sin_family != AF_INET)
+		return -EINVAL;
+	mreq.imr_multiaddr = psin->sin_addr;
+	mreq.imr_ifindex = greq.gr_interface;
+	if (optname == MCAST_JOIN_GROUP)
+		return ip_mc_join_group(sk, &mreq);
+	return ip_mc_leave_group(sk, &mreq);
+}
+
+static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
+				      sockptr_t optval, int optlen)
+{
+	struct compat_group_req greq;
+	struct ip_mreqn mreq = { };
+	struct sockaddr_in *psin;
+
+	if (optlen < sizeof(struct compat_group_req))
+		return -EINVAL;
+	if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+		return -EFAULT;
+
+	psin = (struct sockaddr_in *)&greq.gr_group;
+	if (psin->sin_family != AF_INET)
+		return -EINVAL;
+	mreq.imr_multiaddr = psin->sin_addr;
+	mreq.imr_ifindex = greq.gr_interface;
+
+	if (optname == MCAST_JOIN_GROUP)
+		return ip_mc_join_group(sk, &mreq);
+	return ip_mc_leave_group(sk, &mreq);
+}
+
+static int do_ip_setsockopt(struct sock *sk, int level, int optname,
+		sockptr_t optval, unsigned int optlen)
 {
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
@@ -755,13 +919,14 @@ static int do_ip_setsockopt(struct sock *sk, int level,
	case IP_RECVORIGDSTADDR:
	case IP_CHECKSUM:
	case IP_RECVFRAGSIZE:
+	case IP_RECVERR_RFC4884:
		if (optlen >= sizeof(int)) {
-			if (get_user(val, (int __user *) optval))
+			if (copy_from_sockptr(&val, optval, sizeof(val)))
				return -EFAULT;
		} else if (optlen >= sizeof(char)) {
			unsigned char ucval;
 
-			if (get_user(ucval, (unsigned char __user *) optval))
+			if (copy_from_sockptr(&ucval, optval, sizeof(ucval)))
				return -EFAULT;
			val = (int) ucval;
		}
@@ -786,8 +951,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
		if (optlen > 40)
			goto e_inval;
-		err = ip_options_get_from_user(sock_net(sk), &opt,
-					       optval, optlen);
+		err = ip_options_get(sock_net(sk), &opt, optval, optlen);
		if (err)
			break;
		old = rcu_dereference_protected(inet->inet_opt,
@@ -914,6 +1078,11 @@ static int do_ip_setsockopt(struct sock *sk, int level,
		if (!val)
			skb_queue_purge(&sk->sk_error_queue);
		break;
+	case IP_RECVERR_RFC4884:
+		if (val < 0 || val > 1)
+			goto e_inval;
+		inet->recverr_rfc4884 = !!val;
+		break;
	case IP_MULTICAST_TTL:
		if (sk->sk_type == SOCK_STREAM)
			goto e_inval;
@@ -980,17 +1149,17 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 
		err = -EFAULT;
		if (optlen >= sizeof(struct ip_mreqn)) {
-			if (copy_from_user(&mreq, optval, sizeof(mreq)))
+			if (copy_from_sockptr(&mreq, optval, sizeof(mreq)))
				break;
		} else {
			memset(&mreq, 0, sizeof(mreq));
			if (optlen >= sizeof(struct ip_mreq)) {
-				if (copy_from_user(&mreq, optval,
-						   sizeof(struct ip_mreq)))
+				if (copy_from_sockptr(&mreq, optval,
+						      sizeof(struct ip_mreq)))
					break;
			} else if (optlen >= sizeof(struct in_addr)) {
-				if (copy_from_user(&mreq.imr_address, optval,
-						   sizeof(struct in_addr)))
+				if (copy_from_sockptr(&mreq.imr_address, optval,
+						      sizeof(struct in_addr)))
					break;
			}
		}
@@ -1042,11 +1211,12 @@ static int do_ip_setsockopt(struct sock *sk, int level,
			goto e_inval;
		err = -EFAULT;
		if (optlen >= sizeof(struct ip_mreqn)) {
-			if (copy_from_user(&mreq, optval, sizeof(mreq)))
+			if (copy_from_sockptr(&mreq, optval, sizeof(mreq)))
				break;
		} else {
			memset(&mreq, 0, sizeof(mreq));
-			if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq)))
+			if (copy_from_sockptr(&mreq, optval,
+					      sizeof(struct ip_mreq)))
				break;
		}
 
@@ -1066,7 +1236,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
			err = -ENOBUFS;
			break;
		}
-		msf = memdup_user(optval, optlen);
+		msf = memdup_sockptr(optval, optlen);
		if (IS_ERR(msf)) {
			err = PTR_ERR(msf);
			break;
@@ -1097,7 +1267,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
		if (optlen != sizeof(struct ip_mreq_source))
			goto e_inval;
-		if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+		if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) {
			err = -EFAULT;
			break;
		}
@@ -1127,77 +1297,24 @@ static int do_ip_setsockopt(struct sock *sk, int level,
	}
	case MCAST_JOIN_GROUP:
	case MCAST_LEAVE_GROUP:
-	{
-		struct group_req greq;
-		struct sockaddr_in *psin;
-		struct ip_mreqn mreq;
-
-		if (optlen < sizeof(struct group_req))
-			goto e_inval;
-		err = -EFAULT;
-		if (copy_from_user(&greq, optval, sizeof(greq)))
-			break;
-		psin = (struct sockaddr_in *)&greq.gr_group;
-		if (psin->sin_family != AF_INET)
-			goto e_inval;
-		memset(&mreq, 0, sizeof(mreq));
-		mreq.imr_multiaddr = psin->sin_addr;
-		mreq.imr_ifindex = greq.gr_interface;
-
-		if (optname == MCAST_JOIN_GROUP)
-			err = ip_mc_join_group(sk, &mreq);
+		if (in_compat_syscall())
+			err = compat_ip_mcast_join_leave(sk, optname, optval,
+							 optlen);
		else
-			err = ip_mc_leave_group(sk, &mreq);
+			err = ip_mcast_join_leave(sk, optname, optval, optlen);
		break;
-	}
	case MCAST_JOIN_SOURCE_GROUP:
	case MCAST_LEAVE_SOURCE_GROUP:
	case MCAST_BLOCK_SOURCE:
	case MCAST_UNBLOCK_SOURCE:
-	{
-		struct group_source_req greqs;
-
-		if (optlen != sizeof(struct group_source_req))
-			goto e_inval;
-		if (copy_from_user(&greqs, optval, sizeof(greqs))) {
-			err = -EFAULT;
-			break;
-		}
-		err = do_mcast_group_source(sk, optname, &greqs);
+		err = do_mcast_group_source(sk, optname, optval, optlen);
		break;
-	}
	case MCAST_MSFILTER:
-	{
-		struct group_filter *gsf = NULL;
-
-		if (optlen < GROUP_FILTER_SIZE(0))
-			goto e_inval;
-		if (optlen > sysctl_optmem_max) {
-			err = -ENOBUFS;
-			break;
-		}
-		gsf = memdup_user(optval, optlen);
-		if (IS_ERR(gsf)) {
-			err = PTR_ERR(gsf);
-			break;
-		}
-		/* numsrc >= (4G-140)/128 overflow in 32 bits */
-		if (gsf->gf_numsrc >= 0x1ffffff ||
-		    gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
-			err = -ENOBUFS;
-			goto mc_msf_out;
-		}
-		if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
-			err = -EINVAL;
-			goto mc_msf_out;
-		}
-		err = set_mcast_msfilter(sk, gsf->gf_interface,
-					 gsf->gf_numsrc, gsf->gf_fmode,
-					 &gsf->gf_group, gsf->gf_slist);
-mc_msf_out:
-		kfree(gsf);
+		if (in_compat_syscall())
+			err = compat_ip_set_mcast_msfilter(sk, optval, optlen);
+		else
+			err = ip_set_mcast_msfilter(sk, optval, optlen);
		break;
-	}
	case IP_MULTICAST_ALL:
		if (optlen < 1)
			goto e_inval;
@@ -1296,8 +1413,8 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
		skb_dst_drop(skb);
 }
 
-int ip_setsockopt(struct sock *sk, int level,
-		int optname, char __user *optval, unsigned int optlen)
+int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+		unsigned int optlen)
 {
	int err;
 
@@ -1322,138 +1439,6 @@ int ip_setsockopt(struct sock *sk, int level,
 }
 EXPORT_SYMBOL(ip_setsockopt);
 
-#ifdef CONFIG_COMPAT
-int compat_ip_setsockopt(struct sock *sk, int level, int optname,
-			 char __user *optval, unsigned int optlen)
-{
-	int err;
-
-	if (level != SOL_IP)
-		return -ENOPROTOOPT;
-
-	switch (optname) {
-	case MCAST_JOIN_GROUP:
-	case MCAST_LEAVE_GROUP:
-	{
-		struct compat_group_req __user *gr32 = (void __user *)optval;
-		struct group_req greq;
-		struct sockaddr_in *psin = (struct sockaddr_in *)&greq.gr_group;
-		struct ip_mreqn mreq;
-
-		if (optlen < sizeof(struct compat_group_req))
-			return -EINVAL;
-
-		if (get_user(greq.gr_interface, &gr32->gr_interface) ||
-		    copy_from_user(&greq.gr_group, &gr32->gr_group,
-				   sizeof(greq.gr_group)))
-			return -EFAULT;
-
-		if (psin->sin_family != AF_INET)
-			return -EINVAL;
-
-		memset(&mreq, 0, sizeof(mreq));
-		mreq.imr_multiaddr = psin->sin_addr;
-		mreq.imr_ifindex = greq.gr_interface;
-
-		rtnl_lock();
-		lock_sock(sk);
-		if (optname == MCAST_JOIN_GROUP)
-			err = ip_mc_join_group(sk, &mreq);
-		else
-			err = ip_mc_leave_group(sk, &mreq);
-		release_sock(sk);
-		rtnl_unlock();
-		return err;
-	}
-	case MCAST_JOIN_SOURCE_GROUP:
-	case MCAST_LEAVE_SOURCE_GROUP:
-	case MCAST_BLOCK_SOURCE:
-	case MCAST_UNBLOCK_SOURCE:
-	{
-		struct compat_group_source_req __user *gsr32 = (void __user *)optval;
-		struct group_source_req greqs;
-
-		if (optlen != sizeof(struct compat_group_source_req))
-			return -EINVAL;
-
-		if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) ||
-		    copy_from_user(&greqs.gsr_group, &gsr32->gsr_group,
-				   sizeof(greqs.gsr_group)) ||
-		    copy_from_user(&greqs.gsr_source, &gsr32->gsr_source,
-				   sizeof(greqs.gsr_source)))
-			return -EFAULT;
-
-		rtnl_lock();
-		lock_sock(sk);
-		err = do_mcast_group_source(sk, optname, &greqs);
-		release_sock(sk);
-		rtnl_unlock();
-		return err;
-	}
-	case MCAST_MSFILTER:
-	{
-		const int size0 = offsetof(struct compat_group_filter, gf_slist);
-		struct compat_group_filter *gf32;
-		unsigned int n;
-		void *p;
-
-		if (optlen < size0)
-			return -EINVAL;
-		if (optlen > sysctl_optmem_max - 4)
-			return -ENOBUFS;
-
-		p = kmalloc(optlen + 4, GFP_KERNEL);
-		if (!p)
-			return -ENOMEM;
-		gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */
-		if (copy_from_user(gf32, optval, optlen)) {
-			err = -EFAULT;
-			goto mc_msf_out;
-		}
-
-		n = gf32->gf_numsrc;
-		/* numsrc >= (4G-140)/128 overflow in 32 bits */
-		if (n >= 0x1ffffff) {
-			err = -ENOBUFS;
-			goto mc_msf_out;
-		}
-		if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) {
-			err = -EINVAL;
-			goto mc_msf_out;
-		}
-
-		rtnl_lock();
-		lock_sock(sk);
-		/* numsrc >= (4G-140)/128 overflow in 32 bits */
-		if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf)
-			err = -ENOBUFS;
-		else
-			err = set_mcast_msfilter(sk, gf32->gf_interface,
-						 n, gf32->gf_fmode,
-						 &gf32->gf_group, gf32->gf_slist);
-		release_sock(sk);
-		rtnl_unlock();
-mc_msf_out:
-		kfree(p);
-		return err;
-	}
-	}
-
-	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_NETFILTER
-	/* we need to exclude all possible ENOPROTOOPTs except default case */
-	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
-	    optname != IP_IPSEC_POLICY &&
-	    optname != IP_XFRM_POLICY &&
-	    !ip_mroute_opt(optname))
-		err = compat_nf_setsockopt(sk, PF_INET, optname, optval,
-					   optlen);
-#endif
-	return err;
-}
-EXPORT_SYMBOL(compat_ip_setsockopt);
-#endif
-
 /*
  * Get the options. Note for future reference. The GET of IP options gets
  * the _received_ ones. The set sets the _sent_ ones.
@@ -1469,8 +1454,67 @@ static bool getsockopt_needs_rtnl(int optname)
	return false;
 }
 
+static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
+				 int __user *optlen, int len)
+{
+	const int size0 = offsetof(struct group_filter, gf_slist);
+	struct group_filter __user *p = optval;
+	struct group_filter gsf;
+	int num;
+	int err;
+
+	if (len < size0)
+		return -EINVAL;
+	if (copy_from_user(&gsf, p, size0))
+		return -EFAULT;
+
+	num = gsf.gf_numsrc;
+	err = ip_mc_gsfget(sk, &gsf, p->gf_slist);
+	if (err)
+		return err;
+	if (gsf.gf_numsrc < num)
+		num = gsf.gf_numsrc;
+	if (put_user(GROUP_FILTER_SIZE(num), optlen) ||
+	    copy_to_user(p, &gsf, size0))
+		return -EFAULT;
+	return 0;
+}
+
+static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
+					int __user *optlen, int len)
+{
+	const int size0 = offsetof(struct compat_group_filter, gf_slist);
+	struct compat_group_filter __user *p = optval;
+	struct compat_group_filter gf32;
+	struct group_filter gf;
+	int num;
+	int err;
+
+	if (len < size0)
+		return -EINVAL;
+	if (copy_from_user(&gf32, p, size0))
+		return -EFAULT;
+
+	gf.gf_interface = gf32.gf_interface;
+	gf.gf_fmode = gf32.gf_fmode;
+	num = gf.gf_numsrc = gf32.gf_numsrc;
+	gf.gf_group = gf32.gf_group;
+
+	err = ip_mc_gsfget(sk, &gf, p->gf_slist);
+	if (err)
+		return err;
+	if (gf.gf_numsrc < num)
+		num = gf.gf_numsrc;
+	len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32));
+	if (put_user(len, optlen) ||
+	    put_user(gf.gf_fmode, &p->gf_fmode) ||
+	    put_user(gf.gf_numsrc, &p->gf_numsrc))
+		return -EFAULT;
+	return 0;
+}
+
 static int do_ip_getsockopt(struct sock *sk, int level, int optname,
-			    char __user *optval, int __user *optlen, unsigned int flags)
+			    char __user *optval, int __user *optlen)
 {
	struct inet_sock *inet = inet_sk(sk);
	bool needs_rtnl = getsockopt_needs_rtnl(optname);
@@ -1588,6 +1632,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
	case IP_RECVERR:
		val = inet->recverr;
		break;
+	case IP_RECVERR_RFC4884:
+		val = inet->recverr_rfc4884;
+		break;
	case IP_MULTICAST_TTL:
		val = inet->mc_ttl;
		break;
@@ -1627,31 +1674,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
		goto out;
	}
	case MCAST_MSFILTER:
-	{
-		struct group_filter __user *p = (void __user *)optval;
-		struct group_filter gsf;
-		const int size0 = offsetof(struct group_filter, gf_slist);
-		int num;
-
-		if (len < size0) {
-			err = -EINVAL;
-			goto out;
-		}
-		if (copy_from_user(&gsf, p, size0)) {
-			err = -EFAULT;
-			goto out;
-		}
-		num = gsf.gf_numsrc;
-		err = ip_mc_gsfget(sk, &gsf, p->gf_slist);
-		if (err)
-			goto out;
-		if (gsf.gf_numsrc < num)
-			num = gsf.gf_numsrc;
-		if (put_user(GROUP_FILTER_SIZE(num), optlen) ||
-		    copy_to_user(p, &gsf, size0))
-			err = -EFAULT;
+		if (in_compat_syscall())
+			err = compat_ip_get_mcast_msfilter(sk, optval, optlen,
+							   len);
+		else
+			err = ip_get_mcast_msfilter(sk, optval, optlen, len);
		goto out;
-	}
	case IP_MULTICAST_ALL:
		val = inet->mc_all;
		break;
@@ -1667,7 +1695,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
		msg.msg_control_is_user = true;
		msg.msg_control_user = optval;
		msg.msg_controllen = len;
-		msg.msg_flags = flags;
+		msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
 
		if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
			struct in_pktinfo info;
@@ -1731,7 +1759,8 @@ int ip_getsockopt(struct sock *sk, int level,
 {
	int err;
 
-	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
+	err = do_ip_getsockopt(sk, level, optname, optval, optlen);
+
 #if IS_ENABLED(CONFIG_BPFILTER_UMH)
	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
	    optname < BPFILTER_IPT_GET_MAX)
@@ -1755,79 +1784,3 @@ int ip_getsockopt(struct sock *sk, int level,
	return err;
 }
 EXPORT_SYMBOL(ip_getsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_ip_getsockopt(struct sock *sk, int level, int optname,
-			 char __user *optval, int __user *optlen)
-{
-	int err;
-
-	if (optname == MCAST_MSFILTER) {
-		const int size0 = offsetof(struct compat_group_filter, gf_slist);
-		struct compat_group_filter __user *p = (void __user *)optval;
-		struct compat_group_filter gf32;
-		struct group_filter gf;
-		int ulen, err;
-		int num;
-
-		if (level != SOL_IP)
-			return -EOPNOTSUPP;
-
-		if (get_user(ulen, optlen))
-			return -EFAULT;
-
-		if (ulen < size0)
-			return -EINVAL;
-
-		if (copy_from_user(&gf32, p, size0))
-			return -EFAULT;
-
-		gf.gf_interface = gf32.gf_interface;
-		gf.gf_fmode = gf32.gf_fmode;
-		num = gf.gf_numsrc = gf32.gf_numsrc;
-		gf.gf_group = gf32.gf_group;
-
-		rtnl_lock();
-		lock_sock(sk);
-		err = ip_mc_gsfget(sk, &gf, p->gf_slist);
-		release_sock(sk);
-		rtnl_unlock();
-		if (err)
-			return err;
-		if (gf.gf_numsrc < num)
-			num = gf.gf_numsrc;
-		ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32));
-		if (put_user(ulen, optlen) ||
-		    put_user(gf.gf_fmode, &p->gf_fmode) ||
-		    put_user(gf.gf_numsrc, &p->gf_numsrc))
-			return -EFAULT;
-		return 0;
-	}
-
-	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
-			       MSG_CMSG_COMPAT);
-
-#if IS_ENABLED(CONFIG_BPFILTER_UMH)
-	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
-	    optname < BPFILTER_IPT_GET_MAX)
-		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
-#endif
-#ifdef CONFIG_NETFILTER
-	/* we need to exclude all possible ENOPROTOOPTs except default case */
-	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
-	    !ip_mroute_opt(optname)) {
-		int len;
-
-		if (get_user(len, optlen))
-			return -EFAULT;
-
-		err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len);
-		if (err >= 0)
-			err = put_user(len, optlen);
-		return err;
-	}
-#endif
-	return err;
-}
-EXPORT_SYMBOL(compat_ip_getsockopt);
-#endif
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index f8b419e2475c..75c6013ff9a4 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -25,6 +25,7 @@
 #include <net/protocol.h>
 #include <net/ip_tunnels.h>
 #include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
 #include <net/arp.h>
 #include <net/checksum.h>
 #include <net/dsfield.h>
@@ -184,6 +185,250 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
 
+/**
+ * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
+ * @skb:	Original packet with L2 header
+ * @mtu:	MTU value for ICMP error
+ *
+ * Return: length on success, negative error code if message couldn't be built.
+ */
+static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct icmphdr *icmph;
+	struct iphdr *niph;
+	struct ethhdr eh;
+	int len, err;
+
+	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
+		return -EINVAL;
+
+	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
+	pskb_pull(skb, ETH_HLEN);
+	skb_reset_network_header(skb);
+
+	err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
+	if (err)
+		return err;
+
+	len = skb->len + sizeof(*icmph);
+	err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
+	if (err)
+		return err;
+
+	icmph = skb_push(skb, sizeof(*icmph));
+	*icmph = (struct icmphdr) {
+		.type			= ICMP_DEST_UNREACH,
+		.code			= ICMP_FRAG_NEEDED,
+		.checksum		= 0,
+		.un.frag.__unused	= 0,
+		.un.frag.mtu		= ntohs(mtu),
+	};
+	icmph->checksum = ip_compute_csum(icmph, len);
+	skb_reset_transport_header(skb);
+
+	niph = skb_push(skb, sizeof(*niph));
+	*niph = (struct iphdr) {
+		.ihl			= sizeof(*niph) / 4u,
+		.version		= 4,
+		.tos			= 0,
+		.tot_len		= htons(len + sizeof(*niph)),
+		.id			= 0,
+		.frag_off		= htons(IP_DF),
+		.ttl			= iph->ttl,
+		.protocol		= IPPROTO_ICMP,
+		.saddr			= iph->daddr,
+		.daddr			= iph->saddr,
+	};
+	ip_send_check(niph);
+	skb_reset_network_header(skb);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
+	skb_reset_mac_header(skb);
+
+	return skb->len;
+}
+
+/**
+ * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
+ * @skb:	Buffer being sent by encapsulation, L2 headers expected
+ * @mtu:	Network MTU for path
+ *
+ * Return: 0 for no ICMP reply, length if built, negative value on error.
+ */
+static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
+{
+	const struct icmphdr *icmph = icmp_hdr(skb);
+	const struct iphdr *iph = ip_hdr(skb);
+
+	if (mtu <= 576 || iph->frag_off != htons(IP_DF))
+		return 0;
+
+	if (ipv4_is_lbcast(iph->daddr)  || ipv4_is_multicast(iph->daddr) ||
+	    ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr)  ||
+	    ipv4_is_lbcast(iph->saddr)  || ipv4_is_multicast(iph->saddr))
+		return 0;
+
+	if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
+		return 0;
+
+	return iptunnel_pmtud_build_icmp(skb, mtu);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
+ * @skb:	Original packet with L2 header
+ * @mtu:	MTU value for ICMPv6 error
+ *
+ * Return: length on success, negative error code if message couldn't be built.
+ */
+static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct icmp6hdr *icmp6h;
+	struct ipv6hdr *nip6h;
+	struct ethhdr eh;
+	int len, err;
+	__wsum csum;
+
+	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
+		return -EINVAL;
+
+	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
+	pskb_pull(skb, ETH_HLEN);
+	skb_reset_network_header(skb);
+
+	err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
+	if (err)
+		return err;
+
+	len = skb->len + sizeof(*icmp6h);
+	err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
+	if (err)
+		return err;
+
+	icmp6h = skb_push(skb, sizeof(*icmp6h));
+	*icmp6h = (struct icmp6hdr) {
+		.icmp6_type		= ICMPV6_PKT_TOOBIG,
+		.icmp6_code		= 0,
+		.icmp6_cksum		= 0,
+		.icmp6_mtu		= htonl(mtu),
+	};
+	skb_reset_transport_header(skb);
+
+	nip6h = skb_push(skb, sizeof(*nip6h));
+	*nip6h = (struct ipv6hdr) {
+		.priority		= 0,
+		.version		= 6,
+		.flow_lbl		= { 0 },
+		.payload_len		= htons(len),
+		.nexthdr		= IPPROTO_ICMPV6,
+		.hop_limit		= ip6h->hop_limit,
+		.saddr			= ip6h->daddr,
+		.daddr			= ip6h->saddr,
+	};
+	skb_reset_network_header(skb);
+
+	csum = csum_partial(icmp6h, len, 0);
+	icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
+					      IPPROTO_ICMPV6, csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
+	skb_reset_mac_header(skb);
+
+	return skb->len;
+}
+
+/**
+ * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
+ * @skb:	Buffer being sent by encapsulation, L2 headers expected
+ * @mtu:	Network MTU for path
+ *
+ * Return: 0 for no ICMPv6 reply, length if built, negative value on error.
+ */
+static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	int stype = ipv6_addr_type(&ip6h->saddr);
+	u8 proto = ip6h->nexthdr;
+	__be16 frag_off;
+	int offset;
+
+	if (mtu <= IPV6_MIN_MTU)
+		return 0;
+
+	if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
+	    stype == IPV6_ADDR_LOOPBACK)
+		return 0;
+
+	offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
+				  &frag_off);
+	if (offset < 0 || (frag_off & htons(~0x7)))
+		return 0;
+
+	if (proto == IPPROTO_ICMPV6) {
+		struct icmp6hdr *icmp6h;
+
+		if (!pskb_may_pull(skb, skb_network_header(skb) +
+					offset + 1 - skb->data))
+			return 0;
+
+		icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
+		if (icmpv6_is_err(icmp6h->icmp6_type) ||
+		    icmp6h->icmp6_type == NDISC_REDIRECT)
+			return 0;
+	}
+
+	return iptunnel_pmtud_build_icmpv6(skb, mtu);
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
+/**
+ * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
+ * @skb:	Buffer being sent by encapsulation, L2 headers expected
+ * @encap_dst:	Destination for tunnel encapsulation (outer IP)
+ * @headroom:	Encapsulation header size, bytes
+ * @reply:	Build matching ICMP or ICMPv6 message as a result
+ *
+ * L2 tunnel implementations that can carry IP and can be directly bridged
+ * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
+ * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
+ * based on payload and sent back by the encapsulation itself.
+ *
+ * For routable interfaces, we just need to update the PMTU for the destination.
+ * + * Return: 0 if ICMP error not needed, length if built, negative value on error + */ +int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, + int headroom, bool reply) +{ + u32 mtu = dst_mtu(encap_dst) - headroom; + + if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || + (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu)) + return 0; + + skb_dst_update_pmtu_no_confirm(skb, mtu); + + if (!reply || skb->pkt_type == PACKET_HOST) + return 0; + + if (skb->protocol == htons(ETH_P_IP)) + return iptunnel_pmtud_check_icmp(skb, mtu); + +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + return iptunnel_pmtud_check_icmpv6(skb, mtu); +#endif + return 0; +} +EXPORT_SYMBOL(skb_tunnel_check_pmtu); + /* Often modified stats are per cpu, other are shared (netdev->stats) */ void ip_tunnel_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 460ca1099e8a..49daaed89764 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -91,32 +91,6 @@ static int vti_rcv_proto(struct sk_buff *skb) return vti_rcv(skb, 0, false); } -static int vti_rcv_tunnel(struct sk_buff *skb) -{ - struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id); - const struct iphdr *iph = ip_hdr(skb); - struct ip_tunnel *tunnel; - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); - if (tunnel) { - struct tnl_ptk_info tpi = { - .proto = htons(ETH_P_IP), - }; - - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto, false)) - goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false); - } - - return -EINVAL; -drop: - kfree_skb(skb); - return 0; -} - static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -244,12 +218,15 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, } dst_hold(dst); - dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); + dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { dev->stats.tx_carrier_errors++; goto tx_error_icmp; } + if (dst->flags & DST_XFRM_QUEUE) + goto queued; + if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { dev->stats.tx_carrier_errors++; dst_release(dst); @@ -281,6 +258,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } +queued: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; @@ -496,11 +474,29 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; -static struct xfrm_tunnel ipip_handler __read_mostly = { +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +static int vti_rcv_tunnel(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false); +} + +static struct xfrm_tunnel vti_ipip_handler __read_mostly = { + .handler = vti_rcv_tunnel, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 0, +}; + +static struct xfrm_tunnel vti_ipip6_handler __read_mostly = { .handler = vti_rcv_tunnel, + .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; +#endif static int __net_init vti_init_net(struct net *net) { @@ -670,10 +666,17 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = 
"ipip tunnel"; - err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ipip_failed; +#if IS_ENABLED(CONFIG_IPV6) + err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6); if (err < 0) - goto xfrm_tunnel_failed; + goto xfrm_tunnel_ipip6_failed; +#endif +#endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); @@ -683,8 +686,14 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); -xfrm_tunnel_failed: +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +#if IS_ENABLED(CONFIG_IPV6) + xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); +xfrm_tunnel_ipip6_failed: +#endif + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); +xfrm_tunnel_ipip_failed: +#endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); @@ -700,7 +709,12 @@ pernet_dev_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +#if IS_ENABLED(CONFIG_IPV6) + xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); +#endif + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); +#endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 59bfa3825810..b42683212c65 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -72,6 +72,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->props.flags = x->props.flags; t->props.extra_flags = x->props.extra_flags; memcpy(&t->mark, &x->mark, sizeof(t->mark)); + t->if_id = x->if_id; if (xfrm_init_state(t)) goto error; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index f5c7a58844a4..876fd6ff1ff9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -636,7 +636,10 @@ static int call_ipmr_mfc_entry_notifiers(struct net *net, /** * vif_delete - Delete a VIF entry + * @mrt: Table to delete from + * @vifi: VIF identifier to delete * @notify: Set to 1, if the caller is a notifier_call + * @head: if unregistering the VIF, place it on this queue */ static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) @@ -1338,7 +1341,7 @@ static void mrtsock_destruct(struct sock *sk) * MOSPF/PIM router set up we can clean this up. 
*/ -int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, +int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { struct net *net = sock_net(sk); @@ -1410,7 +1413,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (copy_from_user(&vif, optval, sizeof(vif))) { + if (copy_from_sockptr(&vif, optval, sizeof(vif))) { ret = -EFAULT; break; } @@ -1438,7 +1441,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (copy_from_user(&mfc, optval, sizeof(mfc))) { + if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) { ret = -EFAULT; break; } @@ -1456,7 +1459,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1468,7 +1471,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1483,7 +1486,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1505,7 +1508,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(uval, (u32 __user *)optval)) { + if (copy_from_sockptr(&uval, optval, sizeof(uval))) { ret = -EFAULT; break; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b167f4a5b684..d1e04d2b5170 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -787,8 +787,7 @@ static int compat_table_info(const struct xt_table_info *info, } #endif -static int get_info(struct net *net, void __user *user, - const int *len, int compat) +static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -802,7 +801,7 @@ static int get_info(struct net *net, void __user *user, name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_lock(NFPROTO_ARP); #endif t = xt_request_find_table_lock(net, NFPROTO_ARP, name); @@ -812,7 +811,7 @@ static int get_info(struct net *net, void __user *user, #ifdef CONFIG_COMPAT struct xt_table_info tmp; - if (compat) { + if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; @@ -837,7 +836,7 @@ static int get_info(struct net *net, void __user *user, } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_unlock(NFPROTO_ARP); #endif return ret; @@ -948,8 +947,7 @@ static int __do_replace(struct net *net, const char *name, return ret; } -static int do_replace(struct net *net, const void __user *user, - unsigned int len) +static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct arpt_replace tmp; @@ -957,7 +955,7 @@ static int do_replace(struct net *net, const void __user *user, void *loc_cpu_entry; struct arpt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -973,8 +971,8 @@ static int do_replace(struct net *net, 
const void __user *user, return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -997,8 +995,7 @@ static int do_replace(struct net *net, const void __user *user, return ret; } -static int do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) +static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; @@ -1009,7 +1006,7 @@ static int do_add_counters(struct net *net, const void __user *user, struct arpt_entry *iter; unsigned int addend; - paddc = xt_copy_counters_from_user(user, len, &tmp, compat); + paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); @@ -1246,8 +1243,7 @@ out_unlock: return ret; } -static int compat_do_replace(struct net *net, void __user *user, - unsigned int len) +static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_arpt_replace tmp; @@ -1255,7 +1251,7 @@ static int compat_do_replace(struct net *net, void __user *user, void *loc_cpu_entry; struct arpt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1271,7 +1267,8 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1294,30 +1291,6 @@ static int compat_do_replace(struct net *net, void __user *user, return ret; } -static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case ARPT_SO_SET_REPLACE: - ret = compat_do_replace(sock_net(sk), user, len); - break; - - case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 1); - break; - - default: - ret = -EINVAL; - } - - return ret; -} - static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, @@ -1425,32 +1398,10 @@ static int compat_get_entries(struct net *net, xt_compat_unlock(NFPROTO_ARP); return ret; } - -static int do_arpt_get_ctl(struct sock *, int, void __user *, int *); - -static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, - int *len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case ARPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 1); - break; - case ARPT_SO_GET_ENTRIES: - ret = compat_get_entries(sock_net(sk), user, len); - break; - default: - ret = do_arpt_get_ctl(sk, cmd, user, len); - } - return ret; -} #endif -static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, + unsigned int len) { int ret; @@ -1459,11 +1410,16 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned switch (cmd) { case ARPT_SO_SET_REPLACE: - ret = do_replace(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(sock_net(sk), arg, 
len); + else +#endif + ret = do_replace(sock_net(sk), arg, len); break; case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 0); + ret = do_add_counters(sock_net(sk), arg, len); break; default: @@ -1482,11 +1438,16 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len switch (cmd) { case ARPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 0); + ret = get_info(sock_net(sk), user, len); break; case ARPT_SO_GET_ENTRIES: - ret = get_entries(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_get_entries(sock_net(sk), user, len); + else +#endif + ret = get_entries(sock_net(sk), user, len); break; case ARPT_SO_GET_REVISION_TARGET: { @@ -1610,15 +1571,9 @@ static struct nf_sockopt_ops arpt_sockopts = { .set_optmin = ARPT_BASE_CTL, .set_optmax = ARPT_SO_SET_MAX+1, .set = do_arpt_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_arpt_set_ctl, -#endif .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_arpt_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 5bf9fa06aee0..f15bc21d7301 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -944,8 +944,7 @@ static int compat_table_info(const struct xt_table_info *info, } #endif -static int get_info(struct net *net, void __user *user, - const int *len, int compat) +static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -959,7 +958,7 @@ static int get_info(struct net *net, void __user *user, name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_lock(AF_INET); #endif t = xt_request_find_table_lock(net, AF_INET, name); @@ -969,7 +968,7 @@ static int get_info(struct net *net, void __user *user, #ifdef CONFIG_COMPAT struct xt_table_info tmp; - if (compat) { + if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; @@ -995,7 +994,7 @@ static int get_info(struct net *net, void __user *user, } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_unlock(AF_INET); #endif return ret; @@ -1103,7 +1102,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, const void __user *user, unsigned int len) +do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ipt_replace tmp; @@ -1111,7 +1110,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1127,8 +1126,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1152,8 +1151,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) } static int -do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) +do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct 
xt_counters_info tmp; @@ -1164,7 +1162,7 @@ do_add_counters(struct net *net, const void __user *user, struct ipt_entry *iter; unsigned int addend; - paddc = xt_copy_counters_from_user(user, len, &tmp, compat); + paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); @@ -1486,7 +1484,7 @@ out_unlock: } static int -compat_do_replace(struct net *net, void __user *user, unsigned int len) +compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ipt_replace tmp; @@ -1494,7 +1492,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1510,8 +1508,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1534,31 +1532,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -static int -compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IPT_SO_SET_REPLACE: - ret = compat_do_replace(sock_net(sk), user, len); - break; - - case IPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 1); - break; - - default: - ret = -EINVAL; - } - - return ret; -} - struct compat_ipt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; @@ -1634,33 +1607,10 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_unlock(AF_INET); return ret; } - -static int do_ipt_get_ctl(struct sock *, int, void __user *, int *); - -static int -compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 1); - break; - case IPT_SO_GET_ENTRIES: - ret = compat_get_entries(sock_net(sk), user, len); - break; - default: - ret = do_ipt_get_ctl(sk, cmd, user, len); - } - return ret; -} #endif static int -do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; @@ -1669,11 +1619,16 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) switch (cmd) { case IPT_SO_SET_REPLACE: - ret = do_replace(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(sock_net(sk), arg, len); + else +#endif + ret = do_replace(sock_net(sk), arg, len); break; case IPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 0); + ret = do_add_counters(sock_net(sk), arg, len); break; default: @@ -1693,11 +1648,16 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 0); + ret = get_info(sock_net(sk), user, len); break; case IPT_SO_GET_ENTRIES: - ret = get_entries(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_get_entries(sock_net(sk), user, len); + else +#endif + 
ret = get_entries(sock_net(sk), user, len); break; case IPT_SO_GET_REVISION_MATCH: @@ -1886,15 +1846,9 @@ static struct nf_sockopt_ops ipt_sockopts = { .set_optmin = IPT_BASE_CTL, .set_optmax = IPT_SO_SET_MAX+1, .set = do_ipt_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_ipt_set_ctl, -#endif .get_optmin = IPT_BASE_CTL, .get_optmax = IPT_SO_GET_MAX+1, .get = do_ipt_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_ipt_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f8755a4ae9d4..a8b980ad11d4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -3,7 +3,7 @@ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> * based on ideas of Fabio Olive Leite <olive@unixforge.org> * - * Development of this code funded by SuSE Linux AG, http://www.suse.com/ + * Development of this code funded by SuSE Linux AG, https://www.suse.com/ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 2361fdac2c43..9dcfa4e461b6 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -96,6 +96,21 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, } EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); +static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) +{ + struct dst_entry *dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(struct flowi)); + fl.u.ip4.daddr = ip_hdr(skb_in)->saddr; + nf_ip_route(dev_net(skb_in->dev), &dst, &fl, false); + if (!dst) + return -1; + + skb_dst_set(skb_in, dst); + return 0; +} + /* Send RST reply */ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { @@ -109,6 +124,9 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (!oth) return; + if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(oldskb)) + return; + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) return; @@ -175,6 +193,9 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; + if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(skb_in)) + return; + if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) { icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); return; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 75545a829a2b..1074df726ec0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -292,6 +292,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), + SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 47665919048f..6fd4330287c2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -809,11 +809,11 @@ static int raw_sk_init(struct sock *sk) return 0; } -static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) +static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); - if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) + if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } @@ -838,7 +838,7 @@ out: return ret; } 
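As the arp_tables and ip_tables hunks above show, the dedicated .compat_set/.compat_get handlers are deleted and the native handlers branch on in_compat_syscall() instead, so one entry point serves both ABIs. A hedged userspace analogue of that dispatch shape — in_compat_syscall() is stubbed here, and the command value is illustrative; the kernel derives the real answer from the syscall entry state:

#include <stdbool.h>
#include <stdio.h>

/* Stub: the kernel reads this from the current syscall frame. */
static bool in_compat_syscall(void)
{
	return false;
}

static int do_replace(unsigned int len)
{
	printf("native replace, len=%u\n", len);
	return 0;
}

static int compat_do_replace(unsigned int len)
{
	printf("compat replace, len=%u\n", len);
	return 0;
}

/* Single set_ctl entry point; no parallel compat_* ops table needed. */
static int do_set_ctl(int cmd, unsigned int len)
{
	switch (cmd) {
	case 0: /* stands in for e.g. IPT_SO_SET_REPLACE */
		if (in_compat_syscall())
			return compat_do_replace(len);
		return do_replace(len);
	default:
		return -1;
	}
}

int main(void)
{
	return do_set_ctl(0, 128);
}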
static int do_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) @@ -850,23 +850,13 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname, } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); return do_raw_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level != SOL_RAW) - return compat_ip_setsockopt(sk, level, optname, optval, optlen); - return do_raw_setsockopt(sk, level, optname, optval, optlen); -} -#endif - static int do_raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -887,16 +877,6 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, return do_raw_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_raw_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_RAW) - return compat_ip_getsockopt(sk, level, optname, optval, optlen); - return do_raw_getsockopt(sk, level, optname, optval, optlen); -} -#endif - static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch (cmd) { @@ -980,8 +960,6 @@ struct proto raw_prot = { .usersize = sizeof_field(struct raw_sock, filter), .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_raw_setsockopt, - .compat_getsockopt = compat_raw_getsockopt, .compat_ioctl = compat_raw_ioctl, #endif .diag_destroy = raw_abort, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a01efa062f6b..8ca6bcab7b03 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1050,6 +1050,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); + + /* Don't make lookup fail for bridged encapsulations */ + if (skb && netif_is_any_bridge_port(skb->dev)) + fl4.flowi4_oif = 0; + __ip_rt_update_pmtu(rt, &fl4, mtu); } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 9a4f6b16c9bc..f0794f0232ba 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -212,6 +212,12 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, refcount_set(&req->rsk_refcnt, 1); tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); + + if (rsk_drop_req(req)) { + refcount_set(&req->rsk_refcnt, 2); + return child; + } + if (inet_csk_reqsk_queue_add(sk, req, child)) return child; @@ -276,6 +282,40 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk, + struct sk_buff *skb) +{ + struct request_sock *req; + +#ifdef CONFIG_MPTCP + struct tcp_request_sock *treq; + + if (sk_is_mptcp(sk)) + ops = &mptcp_subflow_request_sock_ops; +#endif + + req = inet_reqsk_alloc(ops, sk, false); + if (!req) + return NULL; + +#if IS_ENABLED(CONFIG_MPTCP) + treq = tcp_rsk(req); + treq->is_mptcp = sk_is_mptcp(sk); + if (treq->is_mptcp) { + int err = mptcp_subflow_init_cookie_req(req, sk, skb); + + if (err) { + reqsk_free(req); + return NULL; + } + } 
+#endif + + return req; +} +EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); + /* On input, sk is a listener. * Output is listener if incoming packet would not create a child * NULL if memory could not be allocated. @@ -326,7 +366,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ + req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb); if (!req) goto out; @@ -350,9 +390,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq->snt_synack = 0; treq->tfo_listener = false; - if (IS_ENABLED(CONFIG_MPTCP)) - treq->is_mptcp = 0; - if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6f0caf9a866d..c06d2bfd2ec4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2764,7 +2764,7 @@ static inline bool tcp_can_repair_sock(const struct sock *sk) (sk->sk_state != TCP_LISTEN); } -static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) +static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; @@ -2774,7 +2774,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l if (len != sizeof(opt)) return -EINVAL; - if (copy_from_user(&opt, optbuf, sizeof(opt))) + if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) @@ -2796,17 +2796,18 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l return 0; } -static int tcp_repair_options_est(struct sock *sk, - struct tcp_repair_opt __user *optbuf, unsigned int len) +static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, + unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; + size_t offset = 0; while (len >= sizeof(opt)) { - if (copy_from_user(&opt, optbuf, sizeof(opt))) + if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; - optbuf++; + offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { @@ -2960,7 +2961,7 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val) } EXPORT_SYMBOL(tcp_sock_set_user_timeout); -static int __tcp_sock_set_keepidle(struct sock *sk, int val) +int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); @@ -2987,7 +2988,7 @@ int tcp_sock_set_keepidle(struct sock *sk, int val) int err; lock_sock(sk); - err = __tcp_sock_set_keepidle(sk, val); + err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } @@ -3020,8 +3021,8 @@ EXPORT_SYMBOL(tcp_sock_set_keepcnt); /* * Socket option code for TCP. 
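Just above, tcp_repair_options_est() stops advancing a __user pointer and instead walks the option buffer with an explicit byte offset, since a sockptr_t cannot be incremented like a raw pointer. A small sketch of that offset-based iteration, with plain memcpy() standing in for copy_from_sockptr_offset():

#include <stdio.h>
#include <string.h>

struct repair_opt {
	unsigned int opt_code;
	unsigned int opt_val;
};

int main(void)
{
	struct repair_opt in[2] = { { 1, 100 }, { 2, 200 } };
	const void *buf = in;
	size_t len = sizeof(in), offset = 0;
	struct repair_opt opt;

	/* Fixed-size records consumed by offset, not pointer arithmetic. */
	while (len >= sizeof(opt)) {
		memcpy(&opt, (const char *)buf + offset, sizeof(opt));
		offset += sizeof(opt);
		len -= sizeof(opt);
		printf("opt_code=%u opt_val=%u\n", opt.opt_code, opt.opt_val);
	}
	return 0;
}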
*/ -static int do_tcp_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +static int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -3037,7 +3038,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < 1) return -EINVAL; - val = strncpy_from_user(name, optval, + val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; @@ -3056,7 +3057,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < 1) return -EINVAL; - val = strncpy_from_user(name, optval, + val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) @@ -3079,7 +3080,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; - if (copy_from_user(key, optval, optlen)) + if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) @@ -3095,7 +3096,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3174,9 +3175,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED) - err = tcp_repair_options_est(sk, - (struct tcp_repair_opt __user *)optval, - optlen); + err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; @@ -3186,7 +3185,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_KEEPIDLE: - err = __tcp_sock_set_keepidle(sk, val); + err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) @@ -3325,7 +3324,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, return err; } -int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, +int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3337,18 +3336,6 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } EXPORT_SYMBOL(tcp_setsockopt); -#ifdef CONFIG_COMPAT -int compat_tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level != SOL_TCP) - return inet_csk_compat_setsockopt(sk, level, optname, - optval, optlen); - return do_tcp_setsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_tcp_setsockopt); -#endif - static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { @@ -3514,10 +3501,12 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ 0; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3571,6 +3560,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); 
nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); + nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, + TCP_NLA_PAD); return stats; } @@ -3896,18 +3887,6 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, } EXPORT_SYMBOL(tcp_getsockopt); -#ifdef CONFIG_COMPAT -int compat_tcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_TCP) - return inet_csk_compat_getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_tcp_getsockopt); -#endif - #ifdef CONFIG_TCP_MD5SIG static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); static DEFINE_MUTEX(tcp_md5sig_mutex); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index bfdfbb972c57..349069d6cd0a 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -2,7 +2,7 @@ /* * Sally Floyd's High Speed TCP (RFC 3649) congestion control * - * See http://www.icir.org/floyd/hstcp.html + * See https://www.icir.org/floyd/hstcp.html * * John Heffner <jheffner@psc.edu> */ diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 88e1f011afe0..55adcfcf96fe 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -4,7 +4,7 @@ * R.N.Shorten, D.J.Leith: * "H-TCP: TCP for high-speed and long-distance networks" * Proc. PFLDnet, Argonne, 2004. - * http://www.hamilton.ie/net/htcp3.pdf + * https://www.hamilton.ie/net/htcp3.pdf */ #include <linux/mm.h> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 518f04355fbf..184ea556f50e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -518,7 +518,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. - * <http://public.lanl.gov/radiant/pubs.html#DRS> + * <https://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at * <http://staff.psc.edu/jheffner/>, @@ -871,12 +871,41 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } +struct tcp_sacktag_state { + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + u64 first_sackt; + u64 last_sackt; + u32 reord; + u32 sack_delivered; + int flag; + unsigned int mss_now; + struct rate_sample *rate; +}; + /* Take a notice that peer is sending D-SACKs */ -static void tcp_dsack_seen(struct tcp_sock *tp) +static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, + u32 end_seq, struct tcp_sacktag_state *state) { + u32 seq_len, dup_segs = 1; + + if (before(start_seq, end_seq)) { + seq_len = end_seq - start_seq; + if (seq_len > tp->mss_cache) + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + } + tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; - tp->dsack_dups++; + tp->dsack_dups += dup_segs; + + state->flag |= FLAG_DSACKING_ACK; + /* A spurious retransmission is delivered */ + state->sack_delivered += dup_segs; + + return dup_segs; } /* It's reordering when higher sequence was delivered (i.e. 
sacked) before @@ -962,6 +991,15 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) } } +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tp->delivered_ce += delivered; +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -1094,52 +1132,38 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, - u32 prior_snd_una) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); - bool dup_sack = false; + u32 dup_segs; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { - dup_sack = true; - tcp_dsack_seen(tp); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); - if (!after(end_seq_0, end_seq_1) && - !before(start_seq_0, start_seq_1)) { - dup_sack = true; - tcp_dsack_seen(tp); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPDSACKOFORECV); - } + if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) + return false; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); + } else { + return false; } + dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); + /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && + if (tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); - return dup_sack; + return true; } -struct tcp_sacktag_state { - u32 reord; - /* Timestamps for earliest and latest never-retransmitted segment - * that was SACKed. RTO needs the earliest RTT to stay conservative, - * but congestion control should still get an accurate delay signal. - */ - u64 first_sackt; - u64 last_sackt; - struct rate_sample *rate; - int flag; - unsigned int mss_now; -}; - /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment @@ -1258,7 +1282,8 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; - tp->delivered += pcount; /* Out-of-order packets delivered */ + /* Out-of-order packets delivered */ + state->sack_delivered += pcount; /* Lost marker hint past SACKed? 
Tweak RFC3517 cnt */ if (tp->lost_skb_hint && @@ -1681,11 +1706,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, tcp_highest_sack_reset(sk); found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, - num_sacks, prior_snd_una); - if (found_dup_sack) { - state->flag |= FLAG_DSACKING_ACK; - tp->delivered++; /* A spurious retransmission is delivered */ - } + num_sacks, prior_snd_una, state); /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1893,7 +1914,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk, int num_dupack) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { if (num_dupack) { struct tcp_sock *tp = tcp_sk(sk); @@ -1904,20 +1925,21 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack) tcp_check_reno_reordering(sk, 0); delivered = tp->sacked_out - prior_sacked; if (delivered > 0) - tp->delivered += delivered; + tcp_count_delivered(tp, delivered, ece_ack); tcp_verify_left_out(tp); } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ -static void tcp_remove_reno_sacks(struct sock *sk, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - tp->delivered += max_t(int, acked - tp->sacked_out, 1); + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), + ece_ack); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -2697,7 +2719,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, * delivered. Lower inflight to clock out (re)tranmissions. */ if (after(tp->snd_nxt, tp->high_seq) && num_dupack) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2779,6 +2801,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; + bool ece_ack = flag & FLAG_ECE; bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && tcp_force_fast_retransmit(sk)); @@ -2787,7 +2810,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ - if (flag & FLAG_ECE) + if (ece_ack) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. 
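The tcp_dsack_seen() rework earlier in this tcp_input.c diff scales duplicate-SACK accounting by the number of segments a D-SACK block covers — DIV_ROUND_UP(seq_len, mss) instead of a flat one — so tp->dsack_dups and the new TCPDSACKRecvSegs counter reflect multi-segment spurious retransmissions. A standalone sketch of that estimate; note the kernel's before() also handles sequence-number wraparound, which this simplification ignores:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Number of duplicate segments covered by a D-SACK block [start, end). */
static unsigned int dsack_dup_segs(unsigned int start_seq,
				   unsigned int end_seq,
				   unsigned int mss)
{
	unsigned int dup_segs = 1;

	if (start_seq < end_seq) {	/* before(start, end), sans wrap */
		unsigned int seq_len = end_seq - start_seq;

		if (seq_len > mss)
			dup_segs = DIV_ROUND_UP(seq_len, mss);
	}
	return dup_segs;
}

int main(void)
{
	/* A 4000-byte D-SACK block at MSS 1448 covers 3 segments. */
	printf("dup_segs=%u\n", dsack_dup_segs(1000, 5000, 1448));
	return 0;
}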
*/ @@ -2828,7 +2851,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2853,7 +2876,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -2877,7 +2900,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, } /* Otherwise enter Recovery state */ - tcp_enter_recovery(sk, (flag & FLAG_ECE)); + tcp_enter_recovery(sk, ece_ack); fast_rexmit = 1; } @@ -2927,6 +2950,8 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); ca_rtt_us = seq_rtt_us; } @@ -3053,7 +3078,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u32 prior_snd_una, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; @@ -3078,8 +3103,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u8 sacked = scb->sacked; u32 acked_pcount; - tcp_ack_tstamp(sk, skb, prior_snd_una); - /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || @@ -3114,7 +3137,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; } else if (tcp_is_sack(tp)) { - tp->delivered += acked_pcount; + tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, tcp_skb_timestamp_us(skb)); @@ -3143,6 +3166,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; + tcp_ack_tstamp(sk, skb, prior_snd_una); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; @@ -3158,8 +3183,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - flag |= FLAG_SACK_RENEGING; + if (skb) { + tcp_ack_tstamp(sk, skb, prior_snd_una); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + flag |= FLAG_SACK_RENEGING; + } if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); @@ -3191,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, } if (tcp_is_reno(tp)) { - tcp_remove_reno_sacks(sk, pkts_acked); + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); /* If any of the cumulatively ACKed segments was * retransmitted, non-SACK case cannot confirm that @@ -3559,10 +3587,9 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) { - tp->delivered_ce += 
delivered; + if (flag & FLAG_ECE) NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); - } + return delivered; } @@ -3586,6 +3613,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; + sack_state.sack_delivered = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); @@ -3661,6 +3689,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ack_ev_flags |= CA_ACK_ECE; } + if (sack_state.sack_delivered) + tcp_count_delivered(tp, sack_state.sack_delivered, + flag & FLAG_ECE); + if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE; @@ -3686,7 +3718,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, + flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); @@ -4427,7 +4460,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket - * @dest: destination queue * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean @@ -6489,7 +6521,6 @@ static void tcp_openreq_init(struct request_sock *req, struct inet_request_sock *ireq = inet_rsk(req); req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ - req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; @@ -6644,6 +6675,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!req) goto drop; + req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; #if IS_ENABLED(CONFIG_MPTCP) @@ -6671,9 +6703,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, af_ops->init_req(req, sk, skb); - if (IS_ENABLED(CONFIG_MPTCP) && want_cookie) - tcp_rsk(req)->is_mptcp = 0; - if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -6709,7 +6738,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 04bfcbbfee83..5084333b5ab6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -76,6 +76,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> +#include <linux/btf_ids.h> #include <crypto/hash.h> #include <linux/scatterlist.h> @@ -1194,7 +1195,7 @@ static void tcp_clear_md5_list(struct sock *sk) } static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, - char __user *optval, int optlen) + sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; @@ -1205,7 +1206,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, if (optlen < sizeof(cmd)) return -EINVAL; - if (copy_from_user(&cmd, optval, sizeof(cmd))) + if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin->sin_family != AF_INET) @@ -2134,10 +2135,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = 
compat_ip_getsockopt, -#endif .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific); @@ -2223,13 +2220,18 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk = cur; + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; @@ -2247,7 +2249,8 @@ get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == afinfo->family) + if (afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) return sk; } spin_unlock(&ilb->lock); @@ -2284,11 +2287,16 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); void *rc = NULL; + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; @@ -2301,7 +2309,8 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if (sk->sk_family != afinfo->family || + if ((afinfo->family != AF_UNSPEC && + sk->sk_family != afinfo->family) || !net_eq(sock_net(sk), net)) { continue; } @@ -2316,19 +2325,25 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + ++st->num; ++st->offset; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == afinfo->family && + if ((afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) && net_eq(sock_net(sk), net)) return sk; } @@ -2607,6 +2622,74 @@ out: return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__tcp { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct sock_common *, sk_common); + uid_t uid __aligned(8); +}; + +static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct sock_common *sk_common, uid_t uid) +{ + struct bpf_iter__tcp ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk_common = sk_common; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + if (sk->sk_state == TCP_TIME_WAIT) { + uid = 0; + } else if (sk->sk_state == TCP_NEW_SYN_RECV) { + const struct request_sock *req = v; + + uid = from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)); + } else { + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + } + + meta.seq = seq; + prog = 
bpf_iter_get_info(&meta, false); + return tcp_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)tcp_prog_seq_show(prog, &meta, v, 0); + } + + tcp_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_tcp_seq_ops = { + .show = bpf_iter_tcp_seq_show, + .start = tcp_seq_start, + .next = tcp_seq_next, + .stop = bpf_iter_tcp_seq_stop, +}; +#endif + static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, .start = tcp_seq_start, @@ -2687,10 +2770,6 @@ struct proto tcp_prot = { .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_tcp_setsockopt, - .compat_getsockopt = compat_tcp_getsockopt, -#endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); @@ -2838,8 +2917,68 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { .exit_batch = tcp_sk_exit_batch, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, + struct sock_common *sk_common, uid_t uid) + +static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct tcp_iter_state *st = priv_data; + struct tcp_seq_afinfo *afinfo; + int ret; + + afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); + if (!afinfo) + return -ENOMEM; + + afinfo->family = AF_UNSPEC; + st->bpf_seq_afinfo = afinfo; + ret = bpf_iter_init_seq_net(priv_data, aux); + if (ret) + kfree(afinfo); + return ret; +} + +static void bpf_iter_fini_tcp(void *priv_data) +{ + struct tcp_iter_state *st = priv_data; + + kfree(st->bpf_seq_afinfo); + bpf_iter_fini_seq_net(priv_data); +} + +static const struct bpf_iter_seq_info tcp_seq_info = { + .seq_ops = &bpf_iter_tcp_seq_ops, + .init_seq_private = bpf_iter_init_tcp, + .fini_seq_private = bpf_iter_fini_tcp, + .seq_priv_size = sizeof(struct tcp_iter_state), +}; + +static struct bpf_iter_reg tcp_reg_info = { + .target = "tcp", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__tcp, sk_common), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &tcp_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; + if (bpf_iter_reg_target(&tcp_reg_info)) + pr_warn("Warning: could not register bpf iterator tcp\n"); +} + +#endif + void __init tcp_v4_init(void) { if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc05d68cd74..85ff417bda7f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,6 +1066,10 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
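The INDIRECT_CALLABLE_DECLARE() lines above, together with the INDIRECT_CALL_INET() sites in the hunk that follows, let __tcp_transmit_skb() compare the ops pointer against the two known implementations and call the match directly, avoiding a retpoline-penalized indirect branch. A simplified userspace model of that dispatch; the macro mirrors the shape of the kernel's INDIRECT_CALL_2() but is not its definition:

#include <stdio.h>

static int ip_queue_xmit_stub(int skb)
{
	printf("v4 xmit %d\n", skb);
	return 0;
}

static int inet6_csk_xmit_stub(int skb)
{
	printf("v6 xmit %d\n", skb);
	return 0;
}

/* Direct-call dispatch: test against the known targets first, fall
 * back to the indirect call only if neither matches. */
#define INDIRECT_CALL_2(f, f2, f1, ...)			\
	((f) == (f2) ? f2(__VA_ARGS__) :		\
	 (f) == (f1) ? f1(__VA_ARGS__) : (f)(__VA_ARGS__))

int main(void)
{
	int (*queue_xmit)(int) = ip_queue_xmit_stub;

	return INDIRECT_CALL_2(queue_xmit, inet6_csk_xmit_stub,
			       ip_queue_xmit_stub, 1);
}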
@@ -1209,7 +1213,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif - icsk->icsk_af_ops->send_check(sk, skb); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, + tcp_v6_send_check, tcp_v4_send_check, + sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); @@ -1237,7 +1243,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_add_tx_delay(skb, tp); - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, + inet6_csk_xmit, ip_queue_xmit, + sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); @@ -3332,6 +3340,8 @@ int tcp_send_synack(struct sock *sk) * sk: listener socket * dst: dst entry attached to the SYNACK * req: request_sock pointer + * foc: cookie for tcp fast open + * synack_type: Type of synback to prepare * * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. @@ -3383,7 +3393,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES - if (unlikely(req->cookie_ts)) + if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ada046f425d2..0c08c420fbc2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -314,7 +314,7 @@ out: /** * tcp_delack_timer() - The TCP delayed ACK timeout handler - * @data: Pointer to the current socket. (gets casted to struct sock *) + * @t: Pointer to the timer. (gets casted to struct sock *) * * This function gets (indirectly) called when the kernel timer for a TCP packet * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work. diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 50a9a6e2c4cd..cd50a61c9976 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -7,7 +7,7 @@ * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." * IEEE Journal on Selected Areas in Communication, * Feb. 2003. - * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf + * See https://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf */ #include <linux/mm.h> diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c4b2ccbeba04..e44aaf41a138 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -110,6 +110,33 @@ drop: return 0; } +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) +{ + struct xfrm_tunnel __rcu *head; + struct xfrm_tunnel *handler; + int ret; + + head = (proto == IPPROTO_IPIP) ? 
tunnel4_handlers : tunnel64_handlers; + + for_each_tunnel_rcu(head, handler) { + if (handler->cb_handler) { + ret = handler->cb_handler(skb, err); + if (ret <= 0) + return ret; + } + } + + return 0; +} + +static const struct xfrm_input_afinfo tunnel4_input_afinfo = { + .family = AF_INET, + .is_ipip = true, + .callback = tunnel4_rcv_cb, +}; +#endif + #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { @@ -231,6 +258,18 @@ static int __init tunnel4_init(void) goto err; } #endif +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); +#if IS_ENABLED(CONFIG_IPV6) + inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); +#endif +#if IS_ENABLED(CONFIG_MPLS) + inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); +#endif + goto err; + } +#endif return 0; err: @@ -240,6 +279,10 @@ err: static void __exit tunnel4_fini(void) { +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) + pr_err("tunnel4 close: can't remove input afinfo\n"); +#endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4077d589b72e..e88efba07551 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -106,6 +106,7 @@ #include <net/xfrm.h> #include <trace/events/udp.h> #include <linux/static_key.h> +#include <linux/btf_ids.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include "udp_impl.h" @@ -408,6 +409,22 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } +static struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum) +{ + struct sock *reuse_sk = NULL; + u32 hash; + + if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { + hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + } + return reuse_sk; +} + /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, @@ -416,9 +433,8 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { - struct sock *sk, *result, *reuseport_result; + struct sock *sk, *result; int score, badness; - u32 hash = 0; result = NULL; badness = 0; @@ -426,25 +442,42 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - reuseport_result = NULL; - - if (sk->sk_reuseport && - sk->sk_state != TCP_ESTABLISHED) { - hash = udp_ehashfn(net, daddr, hnum, - saddr, sport); - reuseport_result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (reuseport_result && !reuseport_has_conns(sk, false)) - return reuseport_result; - } + result = lookup_reuseport(net, sk, skb, + saddr, sport, daddr, hnum); + /* Fall back to scoring if group has connections */ + if (result && !reuseport_has_conns(sk, false)) + return result; - result = reuseport_result ? : sk; + result = result ? 
: sk; badness = score; } } return result; } +static struct sock *udp4_lookup_run_bpf(struct net *net, + struct udp_table *udptable, + struct sk_buff *skb, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (udptable != &udp_table) + return NULL; /* only UDP is supported */ + + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -452,27 +485,45 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { - struct sock *result; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; + struct sock *result, *sk; hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; + /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); - if (!result) { - hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; - - result = udp4_lib_lookup2(net, saddr, sport, - htonl(INADDR_ANY), hnum, dif, sdif, - hslot2, skb); + if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) + goto done; + + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + sk = udp4_lookup_run_bpf(net, udptable, skb, + saddr, sport, daddr, hnum); + if (sk) { + result = sk; + goto done; + } } + + /* Got non-wildcard socket or error on first lookup */ + if (result) + goto done; + + /* Lookup wildcard sockets */ + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp4_lib_lookup2(net, saddr, sport, + htonl(INADDR_ANY), hnum, dif, sdif, + hslot2, skb); +done: if (IS_ERR(result)) return NULL; return result; @@ -2535,7 +2586,7 @@ void udp_destroy_sock(struct sock *sk) * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen, + sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); @@ -2546,7 +2597,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 
1 : 0; @@ -2650,26 +2701,16 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL(udp_lib_setsockopt); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, + return udp_lib_setsockopt(sk, level, optname, + optval, optlen, udp_push_pending_frames); return ip_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, - udp_push_pending_frames); - return compat_ip_setsockopt(sk, level, optname, optval, optlen); -} -#endif - int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -2735,20 +2776,11 @@ int udp_getsockopt(struct sock *sk, int level, int optname, return ip_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_getsockopt(sk, level, optname, optval, optlen); - return compat_ip_getsockopt(sk, level, optname, optval, optlen); -} -#endif /** * udp_poll - wait for a UDP event. - * @file - file struct - * @sock - socket - * @wait - poll table + * @file: - file struct + * @sock: - socket + * @wait: - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. 
If application is using a blocking fd @@ -2815,10 +2847,6 @@ struct proto udp_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = &udp_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, -#endif .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); @@ -2829,10 +2857,15 @@ EXPORT_SYMBOL(udp_prot); static struct sock *udp_get_first(struct seq_file *seq, int start) { struct sock *sk; - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; ++state->bucket) { struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket]; @@ -2844,7 +2877,8 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == afinfo->family) + if (afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) goto found; } spin_unlock_bh(&hslot->lock); @@ -2856,13 +2890,20 @@ found: static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + do { sk = sk_next(sk); - } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != afinfo->family)); + } while (sk && (!net_eq(sock_net(sk), net) || + (afinfo->family != AF_UNSPEC && + sk->sk_family != afinfo->family))); if (!sk) { if (state->bucket <= afinfo->udp_table->mask) @@ -2907,9 +2948,14 @@ EXPORT_SYMBOL(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + if (state->bucket <= afinfo->udp_table->mask) spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); } @@ -2953,6 +2999,67 @@ int udp4_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__udp { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct udp_sock *, udp_sk); + uid_t uid __aligned(8); + int bucket __aligned(8); +}; + +static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct udp_sock *udp_sk, uid_t uid, int bucket) +{ + struct bpf_iter__udp ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.udp_sk = udp_sk; + ctx.uid = uid; + ctx.bucket = bucket; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) +{ + struct udp_iter_state *state = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return udp_prog_seq_show(prog, &meta, v, uid, state->bucket); +} + +static void 
bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)udp_prog_seq_show(prog, &meta, v, 0, 0); + } + + udp_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_udp_seq_ops = { + .start = udp_seq_start, + .next = udp_seq_next, + .stop = bpf_iter_udp_seq_stop, + .show = bpf_iter_udp_seq_show, +}; +#endif + const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, @@ -3070,6 +3177,62 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_sysctl_init, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, + struct udp_sock *udp_sk, uid_t uid, int bucket) + +static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct udp_iter_state *st = priv_data; + struct udp_seq_afinfo *afinfo; + int ret; + + afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); + if (!afinfo) + return -ENOMEM; + + afinfo->family = AF_UNSPEC; + afinfo->udp_table = &udp_table; + st->bpf_seq_afinfo = afinfo; + ret = bpf_iter_init_seq_net(priv_data, aux); + if (ret) + kfree(afinfo); + return ret; +} + +static void bpf_iter_fini_udp(void *priv_data) +{ + struct udp_iter_state *st = priv_data; + + kfree(st->bpf_seq_afinfo); + bpf_iter_fini_seq_net(priv_data); +} + +static const struct bpf_iter_seq_info udp_seq_info = { + .seq_ops = &bpf_iter_udp_seq_ops, + .init_seq_private = bpf_iter_init_udp, + .fini_seq_private = bpf_iter_fini_udp, + .seq_priv_size = sizeof(struct udp_iter_state), +}; + +static struct bpf_iter_reg udp_reg_info = { + .target = "udp", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__udp, udp_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &udp_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP]; + if (bpf_iter_reg_target(&udp_reg_info)) + pr_warn("Warning: could not register bpf iterator udp\n"); +} +#endif + void __init udp_init(void) { unsigned long limit; @@ -3095,4 +3258,8 @@ void __init udp_init(void) if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 6b2fa77eeb1c..2878d8285caf 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -12,17 +12,11 @@ int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); void udp_v4_rehash(struct sock *sk); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -#ifdef CONFIG_COMPAT -int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -#endif int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, diff --git a/net/ipv4/udp_tunnel.c 
b/net/ipv4/udp_tunnel_core.c index 3eecba0874aa..3eecba0874aa 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel_core.c diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c new file mode 100644 index 000000000000..69962165c0e8 --- /dev/null +++ b/net/ipv4/udp_tunnel_nic.c @@ -0,0 +1,897 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020 Facebook Inc. + +#include <linux/ethtool_netlink.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <net/udp_tunnel.h> +#include <net/vxlan.h> + +enum udp_tunnel_nic_table_entry_flags { + UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), + UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), + UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), + UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), +}; + +struct udp_tunnel_nic_table_entry { + __be16 port; + u8 type; + u8 use_cnt; + u8 flags; + u8 hw_priv; +}; + +/** + * struct udp_tunnel_nic - UDP tunnel port offload state + * @work: async work for talking to hardware from process context + * @dev: netdev pointer + * @need_sync: at least one port start changed + * @need_replay: space was freed, we need a replay of all ports + * @work_pending: @work is currently scheduled + * @n_tables: number of tables under @entries + * @missed: bitmap of tables which overflown + * @entries: table of tables of ports currently offloaded + */ +struct udp_tunnel_nic { + struct work_struct work; + + struct net_device *dev; + + u8 need_sync:1; + u8 need_replay:1; + u8 work_pending:1; + + unsigned int n_tables; + unsigned long missed; + struct udp_tunnel_nic_table_entry **entries; +}; + +/* We ensure all work structs are done using driver state, but not the code. + * We need a workqueue we can flush before module gets removed. + */ +static struct workqueue_struct *udp_tunnel_nic_workqueue; + +static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) +{ + switch (type) { + case UDP_TUNNEL_TYPE_VXLAN: + return "vxlan"; + case UDP_TUNNEL_TYPE_GENEVE: + return "geneve"; + case UDP_TUNNEL_TYPE_VXLAN_GPE: + return "vxlan-gpe"; + default: + return "unknown"; + } +} + +static bool +udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt == 0 && !entry->flags; +} + +static bool +udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); +} + +static bool +udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static void +udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) +{ + if (!udp_tunnel_nic_entry_is_free(entry)) + entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static void +udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) +{ + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static bool +udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | + UDP_TUNNEL_NIC_ENTRY_DEL); +} + +static void +udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, + struct udp_tunnel_nic_table_entry *entry, + unsigned int flag) +{ + entry->flags |= flag; + utn->need_sync = 1; +} + +static void +udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, + struct udp_tunnel_info *ti) +{ + memset(ti, 0, sizeof(*ti)); + ti->port = entry->port; + ti->type = entry->type; + ti->hw_priv = entry->hw_priv; +} + +static bool 
+udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) + return false; + return true; +} + +static bool +udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + if (!utn->missed) + return false; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!test_bit(i, &utn->missed)) + continue; + + for (j = 0; j < table->n_entries; j++) + if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) + return true; + } + + return false; +} + +static void +__udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, + unsigned int idx, struct udp_tunnel_info *ti) +{ + struct udp_tunnel_nic_table_entry *entry; + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + entry = &utn->entries[table][idx]; + + if (entry->use_cnt) + udp_tunnel_nic_ti_from_entry(entry, ti); +} + +static void +__udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, + unsigned int idx, u8 priv) +{ + dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; +} + +static void +udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, + int err) +{ + bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + + WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && + entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); + + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && + (!err || (err == -EEXIST && dodgy))) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; + + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && + (!err || (err == -ENOENT && dodgy))) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; + + if (!err) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + else + entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; +} + +static void +udp_tunnel_nic_device_sync_one(struct net_device *dev, + struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx) +{ + struct udp_tunnel_nic_table_entry *entry; + struct udp_tunnel_info ti; + int err; + + entry = &utn->entries[table][idx]; + if (!udp_tunnel_nic_entry_is_queued(entry)) + return; + + udp_tunnel_nic_ti_from_entry(entry, &ti); + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) + err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); + else + err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, + &ti); + udp_tunnel_nic_entry_update_done(entry, err); + + if (err) + netdev_warn(dev, + "UDP tunnel port sync failed port %d type %s: %d\n", + be16_to_cpu(entry->port), + udp_tunnel_nic_tunnel_type_name(entry->type), + err); +} + +static void +udp_tunnel_nic_device_sync_by_port(struct net_device *dev, + struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_device_sync_one(dev, utn, i, j); +} + +static void +udp_tunnel_nic_device_sync_by_table(struct net_device *dev, + struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + int err; + + for (i = 0; i < utn->n_tables; i++) { + /* Find something that needs sync in this table */ + for (j = 0; j < info->tables[i].n_entries; j++) + if 
(udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) + break; + if (j == info->tables[i].n_entries) + continue; + + err = info->sync_table(dev, i); + if (err) + netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", + i, err); + + for (j = 0; j < info->tables[i].n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + if (udp_tunnel_nic_entry_is_queued(entry)) + udp_tunnel_nic_entry_update_done(entry, err); + } + } +} + +static void +__udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + if (!utn->need_sync) + return; + + if (dev->udp_tunnel_nic_info->sync_table) + udp_tunnel_nic_device_sync_by_table(dev, utn); + else + udp_tunnel_nic_device_sync_by_port(dev, utn); + + utn->need_sync = 0; + /* Can't replay directly here, in case we come from the tunnel driver's + * notification - trying to replay may deadlock inside tunnel driver. + */ + utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); +} + +static void +udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + bool may_sleep; + + if (!utn->need_sync) + return; + + /* Drivers which sleep in the callback need to update from + * the workqueue, if we come from the tunnel driver's notification. + */ + may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; + if (!may_sleep) + __udp_tunnel_nic_device_sync(dev, utn); + if (may_sleep || utn->need_replay) { + queue_work(udp_tunnel_nic_workqueue, &utn->work); + utn->work_pending = 1; + } +} + +static bool +udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, + struct udp_tunnel_info *ti) +{ + return table->tunnel_types & ti->type; +} + +static bool +udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i; + + /* Special case IPv4-only NICs */ + if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && + ti->sa_family != AF_INET) + return false; + + for (i = 0; i < utn->n_tables; i++) + if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) + return true; + return false; +} + +static int +udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic_table_entry *entry; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + entry = &utn->entries[i][j]; + + if (!udp_tunnel_nic_entry_is_free(entry) && + entry->port == ti->port && + entry->type != ti->type) { + __set_bit(i, &utn->missed); + return true; + } + } + return false; +} + +static void +udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx, int use_cnt_adj) +{ + struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; + bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + unsigned int from, to; + + /* If not going from used to unused or vice versa - all done. + * For dodgy entries make sure we try to sync again (queue the entry). + */ + entry->use_cnt += use_cnt_adj; + if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) + return; + + /* Cancel the op before it was sent to the device, if possible, + * otherwise we'd need to take special care to issue commands + * in the same order the ports arrived. 
+ */ + if (use_cnt_adj < 0) { + from = UDP_TUNNEL_NIC_ENTRY_ADD; + to = UDP_TUNNEL_NIC_ENTRY_DEL; + } else { + from = UDP_TUNNEL_NIC_ENTRY_DEL; + to = UDP_TUNNEL_NIC_ENTRY_ADD; + } + + if (entry->flags & from) { + entry->flags &= ~from; + if (!dodgy) + return; + } + + udp_tunnel_nic_entry_queue(utn, entry, to); +} + +static bool +udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx, + struct udp_tunnel_info *ti, int use_cnt_adj) +{ + struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; + + if (udp_tunnel_nic_entry_is_free(entry) || + entry->port != ti->port || + entry->type != ti->type) + return false; + + if (udp_tunnel_nic_entry_is_frozen(entry)) + return true; + + udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); + return true; +} + +/* Try to find existing matching entry and adjust its use count, instead of + * adding a new one. Returns true if entry was found. In case of delete the + * entry may have gotten removed in the process, in which case it will be + * queued for removal. + */ +static bool +udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti, int use_cnt_adj) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!udp_tunnel_nic_table_is_capable(table, ti)) + continue; + + for (j = 0; j < table->n_entries; j++) + if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, + use_cnt_adj)) + return true; + } + + return false; +} + +static bool +udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + return udp_tunnel_nic_try_existing(dev, utn, ti, +1); +} + +static bool +udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + return udp_tunnel_nic_try_existing(dev, utn, ti, -1); +} + +static bool +udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!udp_tunnel_nic_table_is_capable(table, ti)) + continue; + + for (j = 0; j < table->n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + if (!udp_tunnel_nic_entry_is_free(entry)) + continue; + + entry->port = ti->port; + entry->type = ti->type; + entry->use_cnt = 1; + udp_tunnel_nic_entry_queue(utn, entry, + UDP_TUNNEL_NIC_ENTRY_ADD); + return true; + } + + /* The different table may still fit this port in, but there + * are no devices currently which have multiple tables accepting + * the same tunnel type, and false positives are okay. 
+ */ + __set_bit(i, &utn->missed); + } + + return false; +} + +static void +__udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) + return; + if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && + ti->port == htons(IANA_VXLAN_UDP_PORT)) { + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); + return; + } + + if (!udp_tunnel_nic_is_capable(dev, utn, ti)) + return; + + /* It may happen that a tunnel of one type is removed and different + * tunnel type tries to reuse its port before the device was informed. + * Rely on utn->missed to re-add this port later. + */ + if (udp_tunnel_nic_has_collision(dev, utn, ti)) + return; + + if (!udp_tunnel_nic_add_existing(dev, utn, ti)) + udp_tunnel_nic_add_new(dev, utn, ti); + + udp_tunnel_nic_device_sync(dev, utn); +} + +static void +__udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + + if (!udp_tunnel_nic_is_capable(dev, utn, ti)) + return; + + udp_tunnel_nic_del_existing(dev, utn, ti); + + udp_tunnel_nic_device_sync(dev, utn); +} + +static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int i, j; + + ASSERT_RTNL(); + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + + utn->need_sync = false; + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + + entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | + UDP_TUNNEL_NIC_ENTRY_OP_FAIL); + /* We don't release rtnl across ops */ + WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); + if (!entry->use_cnt) + continue; + + udp_tunnel_nic_entry_queue(utn, entry, + UDP_TUNNEL_NIC_ENTRY_ADD); + } + + __udp_tunnel_nic_device_sync(dev, utn); +} + +static size_t +__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int j; + size_t size; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + size = 0; + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + size += nla_total_size(0) + /* _TABLE_ENTRY */ + nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ + nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ + } + + return size; +} + +static int +__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + struct nlattr *nest; + unsigned int j; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + + if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + utn->entries[table][j].port) || + nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + ilog2(utn->entries[table][j].type))) + goto err_cancel; + + nla_nest_end(skb, 
nest); + } + + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { + .get_port = __udp_tunnel_nic_get_port, + .set_port_priv = __udp_tunnel_nic_set_port_priv, + .add_port = __udp_tunnel_nic_add_port, + .del_port = __udp_tunnel_nic_del_port, + .reset_ntf = __udp_tunnel_nic_reset_ntf, + .dump_size = __udp_tunnel_nic_dump_size, + .dump_write = __udp_tunnel_nic_dump_write, +}; + +static void +udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + int adj_cnt = -utn->entries[i][j].use_cnt; + + if (adj_cnt) + udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); + } + + __udp_tunnel_nic_device_sync(dev, utn); + + for (i = 0; i < utn->n_tables; i++) + memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, + sizeof(**utn->entries))); + WARN_ON(utn->need_sync); + utn->need_replay = 0; +} + +static void +udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + /* Freeze all the ports we are already tracking so that the replay + * does not double up the refcount. + */ + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); + utn->missed = 0; + utn->need_replay = 0; + + udp_tunnel_get_rx_info(dev); + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); +} + +static void udp_tunnel_nic_device_sync_work(struct work_struct *work) +{ + struct udp_tunnel_nic *utn = + container_of(work, struct udp_tunnel_nic, work); + + rtnl_lock(); + utn->work_pending = 0; + __udp_tunnel_nic_device_sync(utn->dev, utn); + + if (utn->need_replay) + udp_tunnel_nic_replay(utn->dev, utn); + rtnl_unlock(); +} + +static struct udp_tunnel_nic * +udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, + unsigned int n_tables) +{ + struct udp_tunnel_nic *utn; + unsigned int i; + + utn = kzalloc(sizeof(*utn), GFP_KERNEL); + if (!utn) + return NULL; + utn->n_tables = n_tables; + INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); + + utn->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL); + if (!utn->entries) + goto err_free_utn; + + for (i = 0; i < n_tables; i++) { + utn->entries[i] = kcalloc(info->tables[i].n_entries, + sizeof(*utn->entries[i]), GFP_KERNEL); + if (!utn->entries[i]) + goto err_free_prev_entries; + } + + return utn; + +err_free_prev_entries: + while (i--) + kfree(utn->entries[i]); + kfree(utn->entries); +err_free_utn: + kfree(utn); + return NULL; +} + +static int udp_tunnel_nic_register(struct net_device *dev) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int n_tables, i; + + BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < + UDP_TUNNEL_NIC_MAX_TABLES); + + if (WARN_ON(!info->set_port != !info->unset_port) || + WARN_ON(!info->set_port == !info->sync_table) || + WARN_ON(!info->tables[0].n_entries)) + return -EINVAL; + + n_tables = 1; + for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + continue; + + n_tables++; + if (WARN_ON(!info->tables[i - 1].n_entries)) + return -EINVAL; + } + + utn = udp_tunnel_nic_alloc(info, 
n_tables); + if (!utn) + return -ENOMEM; + + utn->dev = dev; + dev_hold(dev); + dev->udp_tunnel_nic = utn; + + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + udp_tunnel_get_rx_info(dev); + + return 0; +} + +static void +udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + unsigned int i; + + /* Flush before we check work, so we don't waste time adding entries + * from the work which we will boot immediately. + */ + udp_tunnel_nic_flush(dev, utn); + + /* Wait for the work to be done using the state, netdev core will + * retry unregister until we give up our reference on this device. + */ + if (utn->work_pending) + return; + + for (i = 0; i < utn->n_tables; i++) + kfree(utn->entries[i]); + kfree(utn->entries); + kfree(utn); + dev->udp_tunnel_nic = NULL; + dev_put(dev); +} + +static int +udp_tunnel_nic_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + const struct udp_tunnel_nic_info *info; + struct udp_tunnel_nic *utn; + + info = dev->udp_tunnel_nic_info; + if (!info) + return NOTIFY_DONE; + + if (event == NETDEV_REGISTER) { + int err; + + err = udp_tunnel_nic_register(dev); + if (err) + netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); + return notifier_from_errno(err); + } + /* All other events will need the udp_tunnel_nic state */ + utn = dev->udp_tunnel_nic; + if (!utn) + return NOTIFY_DONE; + + if (event == NETDEV_UNREGISTER) { + udp_tunnel_nic_unregister(dev, utn); + return NOTIFY_OK; + } + + /* All other events only matter if NIC has to be programmed open */ + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + return NOTIFY_DONE; + + if (event == NETDEV_UP) { + WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); + udp_tunnel_get_rx_info(dev); + return NOTIFY_OK; + } + if (event == NETDEV_GOING_DOWN) { + udp_tunnel_nic_flush(dev, utn); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { + .notifier_call = udp_tunnel_nic_netdevice_event, +}; + +static int __init udp_tunnel_nic_init_module(void) +{ + int err; + + udp_tunnel_nic_workqueue = alloc_workqueue("udp_tunnel_nic", 0, 0); + if (!udp_tunnel_nic_workqueue) + return -ENOMEM; + + rtnl_lock(); + udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; + rtnl_unlock(); + + err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); + if (err) + goto err_unset_ops; + + return 0; + +err_unset_ops: + rtnl_lock(); + udp_tunnel_nic_ops = NULL; + rtnl_unlock(); + destroy_workqueue(udp_tunnel_nic_workqueue); + return err; +} +late_initcall(udp_tunnel_nic_init_module); + +static void __exit udp_tunnel_nic_cleanup_module(void) +{ + unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); + + rtnl_lock(); + udp_tunnel_nic_ops = NULL; + rtnl_unlock(); + + destroy_workqueue(udp_tunnel_nic_workqueue); +} +module_exit(udp_tunnel_nic_cleanup_module); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp_tunnel_stub.c b/net/ipv4/udp_tunnel_stub.c new file mode 100644 index 000000000000..c4b2888f5fef --- /dev/null +++ b/net/ipv4/udp_tunnel_stub.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020 Facebook Inc. 
+ +#include <net/udp_tunnel.h> + +const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops; +EXPORT_SYMBOL_GPL(udp_tunnel_nic_ops); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 5936d66d1ce2..bd8773b49e72 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -56,10 +56,6 @@ struct proto udplite_prot = { .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, -#endif }; EXPORT_SYMBOL(udplite_prot);
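
A minimal sketch of how a NIC driver might hook into the udp_tunnel_nic core added above. This is an illustration, not part of the patch: the foo_* driver and its callbacks are hypothetical, while struct udp_tunnel_nic_info, the per-table layout, UDP_TUNNEL_NIC_INFO_MAY_SLEEP and the dev->udp_tunnel_nic_info hookup follow the code in net/ipv4/udp_tunnel_nic.c above.

	// SPDX-License-Identifier: GPL-2.0-only
	/* Hypothetical driver-side use of the udp_tunnel_nic core. */
	#include <linux/netdevice.h>
	#include <net/udp_tunnel.h>

	/* Program one hardware slot identified by (table, entry). With
	 * UDP_TUNNEL_NIC_INFO_MAY_SLEEP set, the core calls this from its
	 * workqueue, so sleeping (e.g. on firmware commands) is allowed.
	 */
	static int foo_udp_tunnel_set_port(struct net_device *dev,
					   unsigned int table, unsigned int entry,
					   struct udp_tunnel_info *ti)
	{
		/* write ti->port and ti->type into slot (table, entry) */
		return 0;
	}

	static int foo_udp_tunnel_unset_port(struct net_device *dev,
					     unsigned int table, unsigned int entry,
					     struct udp_tunnel_info *ti)
	{
		/* clear the slot previously programmed for ti->port */
		return 0;
	}

	static const struct udp_tunnel_nic_info foo_udp_tunnels = {
		.set_port	= foo_udp_tunnel_set_port,
		.unset_port	= foo_udp_tunnel_unset_port,
		.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP,
		.tables		= {
			{
				.n_entries	= 4,
				.tunnel_types	= UDP_TUNNEL_TYPE_VXLAN,
			},
			{
				.n_entries	= 4,
				.tunnel_types	= UDP_TUNNEL_TYPE_GENEVE,
			},
		},
	};

	/* In the driver's probe path, before register_netdev():
	 *
	 *	netdev->udp_tunnel_nic_info = &foo_udp_tunnels;
	 *
	 * The NETDEV_REGISTER notifier above then allocates the offload
	 * state and, unless UDP_TUNNEL_NIC_INFO_OPEN_ONLY is set, calls
	 * udp_tunnel_get_rx_info() to replay the ports tunnel drivers
	 * already use.
	 */

Hardware that prefers whole-table writes would instead supply .sync_table and leave .set_port/.unset_port NULL; udp_tunnel_nic_register() above enforces that exactly one of the two programming styles is provided.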