diff options
Diffstat (limited to 'net/ipv6')
45 files changed, 1690 insertions, 568 deletions
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 48c452959d2c..ea71e4b0ab7a 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -308,22 +308,12 @@ config IPV6_SEG6_LWTUNNEL depends on IPV6 select LWTUNNEL select DST_CACHE + select IPV6_MULTIPLE_TABLES ---help--- Support for encapsulation of packets within an outer IPv6 header and a Segment Routing Header using the lightweight - tunnels mechanism. - - If unsure, say N. - -config IPV6_SEG6_INLINE - bool "IPv6: direct Segment Routing Header insertion " - depends on IPV6_SEG6_LWTUNNEL - ---help--- - Support for direct insertion of the Segment Routing Header, - also known as inline mode. Be aware that direct insertion of - extension headers (as opposed to encapsulation) may break - multiple mechanisms such as PMTUD or IPSec AH. Use this feature - only if you know exactly what you are doing. + tunnels mechanism. Also enable support for advanced local + processing of SRv6 packets based on their active segment. If unsure, say N. diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 217e9ff0e24b..10e342363793 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -9,7 +9,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o + udp_offload.o seg6.o fib6_notifier.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o @@ -23,7 +23,7 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o ipv6-$(CONFIG_NETLABEL) += calipso.o -ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o +ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 936e9ab4dda5..4a96ebbf8eda 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1399,10 +1399,18 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(struct net *net, + struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + return false; + + return true; #else return false; #endif @@ -1472,7 +1480,7 @@ static int ipv6_get_saddr_eval(struct net *net, /* Rule 3: Avoid deprecated and optimistic addresses */ u8 avoid = IFA_F_DEPRECATED; - if (!ipv6_use_optimistic_addr(score->ifa->idev)) + if (!ipv6_use_optimistic_addr(net, score->ifa->idev)) avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || !(score->ifa->flags & avoid); @@ -2460,7 +2468,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int max_addresses = in6_dev->cnf.max_addresses; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && + if ((net->ipv6.devconf_all->optimistic_dad || + in6_dev->cnf.optimistic_dad) && !net->ipv6.devconf_all->forwarding && sllao) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3030,9 +3039,6 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) static void init_loopback(struct net_device *dev) { struct inet6_dev *idev; - struct net_device *sp_dev; - struct inet6_ifaddr *sp_ifa; - struct rt6_info *sp_rt; /* ::1 */ @@ -3045,45 +3051,6 @@ static void init_loopback(struct net_device *dev) } add_addr(idev, &in6addr_loopback, 128, IFA_HOST); - - /* Add routes to other interface's IPv6 addresses */ - for_each_netdev(dev_net(dev), sp_dev) { - if (!strcmp(sp_dev->name, dev->name)) - continue; - - idev = __in6_dev_get(sp_dev); - if (!idev) - continue; - - read_lock_bh(&idev->lock); - list_for_each_entry(sp_ifa, &idev->addr_list, if_list) { - - if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE)) - continue; - - if (sp_ifa->rt) { - /* This dst has been added to garbage list when - * lo device down, release this obsolete dst and - * reallocate a new router for ifa. - */ - if (!atomic_read(&sp_ifa->rt->rt6i_ref)) { - ip6_rt_put(sp_ifa->rt); - sp_ifa->rt = NULL; - } else { - continue; - } - } - - sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false); - - /* Failure cases are ignored */ - if (!IS_ERR(sp_rt)) { - sp_ifa->rt = sp_rt; - ip6_ins_rt(sp_rt); - } - } - read_unlock_bh(&idev->lock); - } } void addrconf_add_linklocal(struct inet6_dev *idev, @@ -3093,7 +3060,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (idev->cnf.optimistic_dad && + if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || + idev->cnf.optimistic_dad) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3321,11 +3289,11 @@ static void addrconf_gre_config(struct net_device *dev) static int fixup_permanent_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - /* rt6i_ref == 0 means the host route was removed from the + /* !rt6i_node means the host route was removed from the * FIB, for example, if 'lo' device is taken down. In that * case regenerate the host route. */ - if (!ifp->rt || !atomic_read(&ifp->rt->rt6i_ref)) { + if (!ifp->rt || !ifp->rt->rt6i_node) { struct rt6_info *rt, *prev; rt = addrconf_dst_alloc(idev, &ifp->addr, false); @@ -3852,7 +3820,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - idev->cnf.accept_dad < 1 || + (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { bump_id = ifp->flags & IFA_F_TENTATIVE; @@ -3883,7 +3852,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) */ if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(idev)) { + if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -3939,7 +3908,9 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || + idev->cnf.accept_dad > 1) && + !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; @@ -4982,9 +4953,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) /* Don't send DELADDR notification for TENTATIVE address, * since NEWADDR notification is sent only after removing - * TENTATIVE flag. + * TENTATIVE flag, if DAD has not failed. */ - if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && + event == RTM_DELADDR) return; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); @@ -6605,21 +6577,21 @@ int __init addrconf_init(void) rtnl_af_register(&inet6_ops); err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, - NULL); + 0); if (err < 0) goto errout; /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL); - __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL); + __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); + __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, NULL); + inet6_dump_ifaddr, 0); __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, - inet6_dump_ifmcaddr, NULL); + inet6_dump_ifmcaddr, 0); __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, - inet6_dump_ifacaddr, NULL); + inet6_dump_ifacaddr, 0); __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, NULL); + inet6_netconf_dump_devconf, 0); ipv6_addr_label_rtnl_register(); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 7a428f65c7ec..b055bc79f56d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -405,6 +405,18 @@ static const struct nla_policy ifal_policy[IFAL_MAX+1] = { [IFAL_LABEL] = { .len = sizeof(u32), }, }; +static bool addrlbl_ifindex_exists(struct net *net, int ifindex) +{ + + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + rcu_read_unlock(); + + return dev != NULL; +} + static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -439,7 +451,7 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, switch (nlh->nlmsg_type) { case RTM_NEWADDRLABEL: if (ifal->ifal_index && - !__dev_get_by_index(net, ifal->ifal_index)) + !addrlbl_ifindex_exists(net, ifal->ifal_index)) return -EINVAL; err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen, @@ -548,7 +560,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -EINVAL; if (ifal->ifal_index && - !__dev_get_by_index(net, ifal->ifal_index)) + !addrlbl_ifindex_exists(net, ifal->ifal_index)) return -EINVAL; if (!tb[IFAL_ADDRESS]) @@ -593,10 +605,10 @@ out: void __init ipv6_addr_label_rtnl_register(void) { __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, - NULL, NULL); + NULL, RTNL_FLAG_DOIT_UNLOCKED); __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, - NULL, NULL); + NULL, RTNL_FLAG_DOIT_UNLOCKED); __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, - ip6addrlbl_dump, NULL); + ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a88b5b5b7955..fe5262fd6aa5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -210,7 +210,8 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk)); + np->autoflowlabel = ip6_default_np_autolabel(net); + np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets @@ -554,6 +555,8 @@ const struct proto_ops inet6_stream_ops = { .recvmsg = inet_recvmsg, /* ok */ .mmap = sock_no_mmap, .sendpage = inet_sendpage, + .sendmsg_locked = tcp_sendmsg_locked, + .sendpage_locked = tcp_sendpage_locked, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .peek_len = tcp_peek_len, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ab64f367d11c..89910e2c10f4 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -463,28 +463,30 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) return esp6_output_tail(x, skb, &esp); } -int esp6_input_done2(struct sk_buff *skb, int err) +static inline int esp_remove_trailer(struct sk_buff *skb) { struct xfrm_state *x = xfrm_input_state(skb); struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; - int alen = crypto_aead_authsize(aead); - int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); - int elen = skb->len - hlen; - int hdr_len = skb_network_header_len(skb); - int padlen; + int alen, hlen, elen; + int padlen, trimlen; + __wsum csumdiff; u8 nexthdr[2]; + int ret; - if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) - kfree(ESP_SKB_CB(skb)->tmp); + alen = crypto_aead_authsize(aead); + hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + elen = skb->len - hlen; - if (unlikely(err)) + if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) { + ret = xo->proto; goto out; + } if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) BUG(); - err = -EINVAL; + ret = -EINVAL; padlen = nexthdr[0]; if (padlen + 2 + alen >= elen) { net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n", @@ -492,17 +494,46 @@ int esp6_input_done2(struct sk_buff *skb, int err) goto out; } - /* ... check padding bits here. Silly. :-) */ + trimlen = alen + padlen + 2; + if (skb->ip_summed == CHECKSUM_COMPLETE) { + csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0); + skb->csum = csum_block_sub(skb->csum, csumdiff, + skb->len - trimlen); + } + pskb_trim(skb, skb->len - trimlen); + + ret = nexthdr[1]; + +out: + return ret; +} - pskb_trim(skb, skb->len - alen - padlen - 2); - __skb_pull(skb, hlen); +int esp6_input_done2(struct sk_buff *skb, int err) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); + struct crypto_aead *aead = x->data; + int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + int hdr_len = skb_network_header_len(skb); + + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); + + if (unlikely(err)) + goto out; + + err = esp_remove_trailer(skb); + if (unlikely(err < 0)) + goto out; + + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + skb_pull_rcsum(skb, hlen); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); - err = nexthdr[1]; - /* RFC4303: Drop dummy packets without any error */ if (err == IPPROTO_NONE) err = -EINVAL; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 1cf437f75b0b..333a478aa161 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -209,11 +209,13 @@ out: static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; + struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; - skb->ip_summed = CHECKSUM_NONE; + if (!(xo->flags & CRYPTO_DONE)) + skb->ip_summed = CHECKSUM_NONE; return esp6_input_done2(skb, 0); } @@ -332,3 +334,4 @@ module_init(esp6_offload_init); module_exit(esp6_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); +MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 3cec529c6113..95516138e861 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -882,7 +882,7 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, (hops - 1) * sizeof(struct in6_addr)); sr_phdr->segments[0] = **addr_p; - *addr_p = &sr_ihdr->segments[hops - 1]; + *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { @@ -1174,7 +1174,7 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; - fl6->daddr = srh->segments[srh->first_segment]; + fl6->daddr = srh->segments[srh->segments_left]; break; } default: diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c new file mode 100644 index 000000000000..05f82baaa99e --- /dev/null +++ b/net/ipv6/fib6_notifier.c @@ -0,0 +1,63 @@ +#include <linux/notifier.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <net/net_namespace.h> +#include <net/fib_notifier.h> +#include <net/netns/ipv6.h> +#include <net/ip6_fib.h> + +int call_fib6_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->family = AF_INET6; + return call_fib_notifier(nb, net, event_type, info); +} + +int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->family = AF_INET6; + return call_fib_notifiers(net, event_type, info); +} + +static unsigned int fib6_seq_read(struct net *net) +{ + return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); +} + +static int fib6_dump(struct net *net, struct notifier_block *nb) +{ + int err; + + err = fib6_rules_dump(net, nb); + if (err) + return err; + + return fib6_tables_dump(net, nb); +} + +static const struct fib_notifier_ops fib6_notifier_ops_template = { + .family = AF_INET6, + .fib_seq_read = fib6_seq_read, + .fib_dump = fib6_dump, + .owner = THIS_MODULE, +}; + +int __net_init fib6_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv6.notifier_ops = ops; + + return 0; +} + +void __net_exit fib6_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv6.notifier_ops); +} diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index ec849d88a662..b240f24a6e52 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -14,6 +14,7 @@ */ #include <linux/netdevice.h> +#include <linux/notifier.h> #include <linux/export.h> #include <net/fib_rules.h> @@ -29,22 +30,65 @@ struct fib6_rule { u8 tclass; }; -struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, - int flags, pol_lookup_t lookup) +static bool fib6_rule_matchall(const struct fib_rule *rule) +{ + struct fib6_rule *r = container_of(rule, struct fib6_rule, common); + + if (r->dst.plen || r->src.plen || r->tclass) + return false; + return fib_rule_matchall(rule); +} + +bool fib6_rule_default(const struct fib_rule *rule) { - struct fib_lookup_arg arg = { - .lookup_ptr = lookup, - .flags = FIB_LOOKUP_NOREF, - }; + if (!fib6_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || + rule->l3mdev) + return false; + if (rule->table != RT6_TABLE_LOCAL && rule->table != RT6_TABLE_MAIN) + return false; + return true; +} +EXPORT_SYMBOL_GPL(fib6_rule_default); - /* update flow if oif or iif point to device enslaved to l3mdev */ - l3mdev_update_flow(net, flowi6_to_flowi(fl6)); +int fib6_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, AF_INET6); +} - fib_rules_lookup(net->ipv6.fib6_rules_ops, - flowi6_to_flowi(fl6), flags, &arg); +unsigned int fib6_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, AF_INET6); +} - if (arg.result) - return arg.result; +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + int flags, pol_lookup_t lookup) +{ + if (net->ipv6.fib6_has_custom_rules) { + struct fib_lookup_arg arg = { + .lookup_ptr = lookup, + .flags = FIB_LOOKUP_NOREF, + }; + + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + + fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); + + if (arg.result) + return arg.result; + } else { + struct rt6_info *rt; + + rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags); + if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) + return &rt->dst; + ip6_rt_put(rt); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + if (rt->dst.error != -EAGAIN) + return &rt->dst; + ip6_rt_put(rt); + } dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; @@ -214,6 +258,7 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule6->dst.plen = frh->dst_len; rule6->tclass = frh->tos; + net->ipv6.fib6_has_custom_rules = true; err = 0; errout: return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 8d7b113958b1..5acb54405b10 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -399,6 +399,24 @@ relookup_failed: return ERR_PTR(err); } +static int icmp6_iif(const struct sk_buff *skb) +{ + int iif = skb->dev->ifindex; + + /* for local traffic to local address, skb dev is the loopback + * device. Check if there is a dst attached to the skb and if so + * get the real device index. + */ + if (unlikely(iif == LOOPBACK_IFINDEX)) { + const struct rt6_info *rt6 = skb_rt6_info(skb); + + if (rt6) + iif = rt6->rt6i_idev->dev->ifindex; + } + + return iif; +} + /* * Send an ICMP message in response to a packet in error */ @@ -459,9 +477,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, * Source addr check */ - if (__ipv6_addr_needs_scope_id(addr_type)) - iif = skb->dev->ifindex; - else { + if (__ipv6_addr_needs_scope_id(addr_type)) { + iif = icmp6_iif(skb); + } else { dst = skb_dst(skb); iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev); } @@ -508,6 +526,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); + fl6.mp_hash = rt6_multipath_hash(&fl6, skb); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); @@ -682,7 +701,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = skb->dev->ifindex; + fl6.flowi6_oif = icmp6_iif(skb); fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 77f7f8c7d93d..5bd419c1abc8 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -208,7 +208,7 @@ ila_nf_input(void *priv, return NF_ACCEPT; } -static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = { +static const struct nf_hook_ops ila_nf_hook_ops[] = { { .hook = ila_nf_input, .pf = NFPROTO_IPV6, diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b13b8f93079d..b01858f5deb1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -56,7 +56,7 @@ struct sock *__inet6_lookup_established(struct net *net, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, - const int dif) + const int dif, const int sdif) { struct sock *sk; const struct hlist_nulls_node *node; @@ -73,12 +73,12 @@ begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; - if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif)) + if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; - if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) { + if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, - const int dif, bool exact_dif) + const int dif, const int sdif, bool exact_dif) { int score = -1; @@ -110,9 +110,13 @@ static inline int compute_score(struct sock *sk, struct net *net, score++; } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score++; + if (sk->sk_bound_dev_if && dev_match) + score++; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -126,7 +130,7 @@ struct sock *inet6_lookup_listener(struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif) + const unsigned short hnum, const int dif, const int sdif) { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -136,7 +140,7 @@ struct sock *inet6_lookup_listener(struct net *net, u32 phash = 0; sk_for_each(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif, exact_dif); + score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -171,7 +175,7 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, bool refcounted; sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, - ntohs(dport), dif, &refcounted); + ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -187,8 +191,9 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; - const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); + const int sdif = l3mdev_master_ifindex_by_index(net, dif); + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); @@ -203,7 +208,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, if (sk2->sk_hash != hash) continue; - if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) { + if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, + dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e1c85bb4eac0..e5308d7cbd75 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -33,6 +33,7 @@ #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> +#include <net/fib_notifier.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> @@ -165,7 +166,7 @@ static void node_free(struct fib6_node *fn) call_rcu(&fn->rcu, node_free_rcu); } -static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) +void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -188,14 +189,12 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) free_percpu(non_pcpu_rt->rt6i_pcpu); non_pcpu_rt->rt6i_pcpu = NULL; } +EXPORT_SYMBOL_GPL(rt6_free_pcpu); -static void rt6_release(struct rt6_info *rt) +static void fib6_free_table(struct fib6_table *table) { - if (atomic_dec_and_test(&rt->rt6i_ref)) { - rt6_free_pcpu(rt); - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } + inetpeer_invalidate_tree(&table->tb6_peers); + kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) @@ -314,6 +313,109 @@ static void __net_init fib6_tables_init(struct net *net) #endif +unsigned int fib6_tables_seq_read(struct net *net) +{ + unsigned int h, fib_seq = 0; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + struct fib6_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { + read_lock_bh(&tb->tb6_lock); + fib_seq += tb->fib_seq; + read_unlock_bh(&tb->tb6_lock); + } + } + rcu_read_unlock(); + + return fib_seq; +} + +static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct rt6_info *rt) +{ + struct fib6_entry_notifier_info info = { + .rt = rt, + }; + + return call_fib6_notifier(nb, net, event_type, &info.info); +} + +static int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct rt6_info *rt) +{ + struct fib6_entry_notifier_info info = { + .rt = rt, + }; + + rt->rt6i_table->fib_seq++; + return call_fib6_notifiers(net, event_type, &info.info); +} + +struct fib6_dump_arg { + struct net *net; + struct notifier_block *nb; +}; + +static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) +{ + if (rt == arg->net->ipv6.ip6_null_entry) + return; + call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); +} + +static int fib6_node_dump(struct fib6_walker *w) +{ + struct rt6_info *rt; + + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) + fib6_rt_dump(rt, w->args); + w->leaf = NULL; + return 0; +} + +static void fib6_table_dump(struct net *net, struct fib6_table *tb, + struct fib6_walker *w) +{ + w->root = &tb->tb6_root; + read_lock_bh(&tb->tb6_lock); + fib6_walk(net, w); + read_unlock_bh(&tb->tb6_lock); +} + +/* Called with rcu_read_lock() */ +int fib6_tables_dump(struct net *net, struct notifier_block *nb) +{ + struct fib6_dump_arg arg; + struct fib6_walker *w; + unsigned int h; + + w = kzalloc(sizeof(*w), GFP_ATOMIC); + if (!w) + return -ENOMEM; + + w->func = fib6_node_dump; + arg.net = net; + arg.nb = nb; + w->args = &arg; + + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + struct fib6_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb6_hlist) + fib6_table_dump(net, tb, w); + } + + kfree(w); + + return 0; +} + static int fib6_dump_node(struct fib6_walker *w) { int res; @@ -745,8 +847,6 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, } fn = fn->parent; } - /* No more references are possible at this point. */ - BUG_ON(atomic_read(&rt->rt6i_ref) != 1); } } @@ -891,6 +991,8 @@ add: *ins = rt; rcu_assign_pointer(rt->rt6i_node, fn); atomic_inc(&rt->rt6i_ref); + call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, + rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -918,6 +1020,8 @@ add: rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); + call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, + rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { @@ -925,6 +1029,7 @@ add: fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->rt6i_nsiblings; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; @@ -939,6 +1044,7 @@ add: break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; @@ -1473,6 +1579,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); @@ -1858,6 +1965,11 @@ static void fib6_gc_timer_cb(unsigned long arg) static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + int err; + + err = fib6_notifier_init(net); + if (err) + return err; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); @@ -1910,22 +2022,31 @@ out_fib_table_hash: out_rt6_stats: kfree(net->ipv6.rt6_stats); out_timer: + fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { + unsigned int i; + rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); -#endif - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; + struct hlist_node *tmp; + struct fib6_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { + hlist_del(&tb->tb6_hlist); + fib6_free_table(tb); + } + } + kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); + fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { @@ -1949,7 +2070,7 @@ int __init fib6_init(void) goto out_kmem_cache_create; ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, - NULL); + 0); if (ret) goto out_unregister_subsys; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 67ff2aaf5dcb..1602b491b281 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset; + mtu = be32_to_cpu(info) - offset - t->tun_hlen; + if (t->dev->type == ARPHRD_ETHER) + mtu -= ETH_HLEN; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; @@ -938,24 +940,25 @@ done: } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(ipv6h+1); + struct ipv6hdr *ipv6h; + __be16 *p; - ip6_flow_hdr(ipv6h, 0, - ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, true, - &t->fl.u.ip6)); + ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); + ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, + true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; - p[0] = t->parms.o_flags; - p[1] = htons(type); + p = (__be16 *)(ipv6h + 1); + p[0] = t->parms.o_flags; + p[1] = htons(type); /* * Set the source hardware address. @@ -1308,6 +1311,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index cdb3728faca7..4a87f9428ca5 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (gso_partial) + if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2dfe50d8d609..43ca864327c7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1110,69 +1110,6 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); -static inline int ip6_ufo_append_data(struct sock *sk, - struct sk_buff_head *queue, - int getfrag(void *from, char *to, int offset, int len, - int odd, struct sk_buff *skb), - void *from, int length, int hh_len, int fragheaderlen, - int exthdrlen, int transhdrlen, int mtu, - unsigned int flags, const struct flowi6 *fl6) - -{ - struct sk_buff *skb; - int err; - - /* There is support for UDP large send offload by network - * device, so create one single skb packet containing complete - * udp datagram - */ - skb = skb_peek_tail(queue); - if (!skb) { - skb = sock_alloc_send_skb(sk, - hh_len + fragheaderlen + transhdrlen + 20, - (flags & MSG_DONTWAIT), &err); - if (!skb) - return err; - - /* reserve space for Hardware header */ - skb_reserve(skb, hh_len); - - /* create space for UDP/IP header */ - skb_put(skb, fragheaderlen + transhdrlen); - - /* initialize network header pointer */ - skb_set_network_header(skb, exthdrlen); - - /* initialize protocol header pointer */ - skb->transport_header = skb->network_header + fragheaderlen; - - skb->protocol = htons(ETH_P_IPV6); - skb->csum = 0; - - if (flags & MSG_CONFIRM) - skb_set_dst_pending_confirm(skb, 1); - - __skb_queue_tail(queue, skb); - } else if (skb_is_gso(skb)) { - goto append; - } - - skb->ip_summed = CHECKSUM_PARTIAL; - /* Specify the length of each IPv6 datagram fragment. - * It has to be a multiple of 8. - */ - skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - - sizeof(struct frag_hdr)) & ~7; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), - &fl6->daddr, - &fl6->saddr); - -append: - return skb_append_datato_frags(sk, skb, getfrag, from, - (length - transhdrlen)); -} - static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, gfp_t gfp) { @@ -1381,20 +1318,6 @@ emsgsize: */ cork->length += length; - if ((skb && skb_is_gso(skb)) || - (((length + (skb ? skb->len : headersize)) > mtu) && - (skb_queue_len(queue) <= 1) && - (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && - (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) { - err = ip6_ufo_append_data(sk, queue, getfrag, from, length, - hh_len, fragheaderlen, exthdrlen, - transhdrlen, mtu, flags, fl6); - if (err) - goto error; - return 0; - } - if (!skb) goto alloc_new_skb; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3a0ba2ae4b0f..a1c24443cd9e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -171,7 +171,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ } t = rcu_dereference(ip6n->collect_md_tun); - if (t) + if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); @@ -1043,6 +1043,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1124,7 +1125,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1133,7 +1134,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; @@ -1184,6 +1185,7 @@ route_lookup: init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } + hop_limit = hop_limit ? : ip6_dst_hoplimit(dst); /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. @@ -2258,6 +2260,9 @@ static int __init ip6_tunnel_init(void) { int err; + if (!ipv6_mod_enabled()) + return -EOPNOTSUPP; + err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 486c2305f53c..bcdc2d557de1 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -502,7 +503,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { @@ -1145,33 +1146,6 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .priority = 100, }; -static bool is_vti6_tunnel(const struct net_device *dev) -{ - return dev->netdev_ops == &vti6_netdev_ops; -} - -static int vti6_device_event(struct notifier_block *unused, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct ip6_tnl *t = netdev_priv(dev); - - if (!is_vti6_tunnel(dev)) - return NOTIFY_DONE; - - switch (event) { - case NETDEV_DOWN: - if (!net_eq(t->net, dev_net(dev))) - xfrm_garbage_collect(t->net); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block vti6_notifier_block __read_mostly = { - .notifier_call = vti6_device_event, -}; - /** * vti6_tunnel_init - register protocol and reserve needed resources * @@ -1182,8 +1156,6 @@ static int __init vti6_tunnel_init(void) const char *msg; int err; - register_netdevice_notifier(&vti6_notifier_block); - msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) @@ -1216,7 +1188,6 @@ xfrm_proto_ah_failed: xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); pernet_dev_failed: - unregister_netdevice_notifier(&vti6_notifier_block); pr_err("vti6 init: failed to register %s\n", msg); return err; } @@ -1231,7 +1202,6 @@ static void __exit vti6_tunnel_cleanup(void) xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); - unregister_netdevice_notifier(&vti6_notifier_block); } module_init(vti6_tunnel_init); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7454850f2098..f5500f5444e9 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1427,7 +1427,7 @@ int __init ip6_mr_init(void) } #endif rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, - ip6mr_rtm_dumproute, NULL); + ip6mr_rtm_dumproute, 0); return 0; #ifdef CONFIG_IPV6_PIMSM_V2 add_proto_fail: diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0327c1f2e6fc..266a530414d7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -127,7 +127,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, @@ -1779,6 +1779,7 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, static struct notifier_block ndisc_netdev_notifier = { .notifier_call = ndisc_netdev_event, + .priority = ADDRCONF_NOTIFY_PRIORITY - 5, }; #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1f90644056ac..01bd3ee5ebc6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -39,12 +39,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); -#ifdef CONFIG_NETFILTER_DEBUG -#define IP_NF_ASSERT(x) WARN_ON(!(x)) -#else -#define IP_NF_ASSERT(x) -#endif - void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); @@ -176,7 +170,7 @@ static const char *const comments[] = { [NF_IP6_TRACE_COMMENT_POLICY] = "policy", }; -static struct nf_loginfo trace_loginfo = { +static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { @@ -284,7 +278,7 @@ ip6t_do_table(struct sk_buff *skb, acpar.hotdrop = false; acpar.state = state; - IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); @@ -315,7 +309,7 @@ ip6t_do_table(struct sk_buff *skb, const struct xt_entry_match *ematch; struct xt_counters *counter; - IP_NF_ASSERT(e); + WARN_ON(!e); acpar.thoff = 0; if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { @@ -335,7 +329,7 @@ ip6t_do_table(struct sk_buff *skb, ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); - IP_NF_ASSERT(t->u.kernel.target); + WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ @@ -801,6 +795,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index ce203dd729e0..437af8c95277 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -353,7 +353,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, nexthdr = ipv6_hdr(skb)->nexthdr; thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (thoff < 0) + if (thoff < 0 || nexthdr != IPPROTO_TCP) return NF_ACCEPT; th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); @@ -438,7 +438,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, return NF_ACCEPT; } -static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { +static const struct nf_hook_ops ipv6_synproxy_ops[] = { { .hook = ipv6_synproxy_hook, .pf = NFPROTO_IPV6, diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 7d2bd940291f..991512576c8c 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -69,7 +69,7 @@ static unsigned int ip6table_nat_local_fn(void *priv, return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain); } -static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { +static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = ip6table_nat_in, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 4e3402486833..fe01dc953c56 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -67,13 +67,6 @@ static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static void ipv6_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "src=%pI6 dst=%pI6 ", - tuple->src.u3.ip6, tuple->dst.u3.ip6); -} - static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -191,7 +184,7 @@ static unsigned int ipv6_conntrack_local(void *priv, return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } -static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { +static const struct nf_hook_ops ipv6_conntrack_ops[] = { { .hook = ipv6_conntrack_in, .pf = NFPROTO_IPV6, @@ -308,11 +301,6 @@ static int ipv6_nlattr_to_tuple(struct nlattr *tb[], return 0; } - -static int ipv6_nlattr_tuple_size(void) -{ - return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1); -} #endif static int ipv6_hooks_register(struct net *net) @@ -353,16 +341,15 @@ static void ipv6_hooks_unregister(struct net *net) struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .l3proto = PF_INET6, - .name = "ipv6", .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, - .print_tuple = ipv6_print_tuple, .get_l4proto = ipv6_get_l4proto, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv6_tuple_to_nlattr, - .nlattr_tuple_size = ipv6_nlattr_tuple_size, .nlattr_to_tuple = ipv6_nlattr_to_tuple, .nla_policy = ipv6_nla_policy, + .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) + + NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])), #endif .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, @@ -398,25 +385,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto6[] = { static int ipv6_net_init(struct net *net) { - int ret = 0; - - ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - if (ret < 0) - return ret; - - ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: pernet registration failed.\n"); - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - } - return ret; + return nf_ct_l4proto_pernet_register(net, builtin_l4proto6, + ARRAY_SIZE(builtin_l4proto6)); } static void ipv6_net_exit(struct net *net) { - nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6); nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, ARRAY_SIZE(builtin_l4proto6)); } @@ -434,6 +408,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) need_conntrack(); +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) != + nf_conntrack_l3proto_ipv6.nla_size)) + return -EINVAL; +#endif + ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index d5f028e33f65..a9e1fd1a8536 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -84,16 +84,6 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. */ -static void icmpv6_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "type=%u code=%u id=%u ", - tuple->dst.u.icmp.type, - tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); -} - static unsigned int *icmpv6_get_timeouts(struct net *net) { return &icmpv6_pernet(net)->timeout; @@ -105,7 +95,6 @@ static int icmpv6_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeout) { /* Do not immediately delete the connection after the first @@ -131,11 +120,6 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); - if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6)) - nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL, - NULL, NULL, - "nf_ct_icmpv6: invalid new with type %d ", - type + 128); return false; } return true; @@ -144,8 +128,7 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, static int icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int icmp6off, - unsigned int hooknum) + unsigned int icmp6off) { struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; @@ -153,7 +136,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; - NF_CT_ASSERT(!skb_nfct(skb)); + WARN_ON(skb_nfct(skb)); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuplepr(skb, @@ -229,7 +212,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum); + return icmpv6_error_message(net, tmpl, skb, dataoff); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -367,10 +350,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_ICMPV6, - .name = "icmpv6", .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, - .print_tuple = icmpv6_print_tuple, .packet = icmpv6_packet, .get_timeouts = icmpv6_get_timeouts, .new = icmpv6_new, diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 986d4ca38832..b263bf3a19f7 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { - int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&net->nf_frag.frags); - if (res) - return res; - res = nf_ct_frag6_sysctl_register(net); - if (res) - inet_frags_uninit_net(&net->nf_frag.frags); - return res; + inet_frags_init_net(&net->nf_frag.frags); + + return nf_ct_frag6_sysctl_register(net); } static void nf_ct_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index ada60d1a991b..b326da59257f 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -74,7 +74,7 @@ static unsigned int ipv6_defrag(void *priv, return err == 0 ? NF_ACCEPT : NF_DROP; } -static struct nf_hook_ops ipv6_defrag_ops[] = { +static const struct nf_hook_ops ipv6_defrag_ops[] = { { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 97c724224da7..b397a8fe88b9 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -25,7 +25,7 @@ #include <linux/netfilter/xt_LOG.h> #include <net/netfilter/nf_log.h> -static struct nf_loginfo default_loginfo = { +static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index b2b4f031b3a1..46d6dba50698 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -196,7 +196,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conntrack_tuple target; unsigned long statusbit; - NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY); + WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) return 0; @@ -319,8 +319,8 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, default: /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); + WARN_ON(ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index d7b679037bae..98f61fcb9108 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -36,8 +36,8 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, struct nf_nat_range newrange; ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); + WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); if (ipv6_dev_get_saddr(nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &src) < 0) diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 43f91d9b086c..54b5899543ef 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -25,9 +25,9 @@ static int get_ifindex(const struct net_device *dev) static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, const struct nft_pktinfo *pkt, - const struct net_device *dev) + const struct net_device *dev, + struct ipv6hdr *iph) { - const struct ipv6hdr *iph = ipv6_hdr(pkt->skb); int lookup_flags = 0; if (priv->flags & NFTA_FIB_F_DADDR) { @@ -55,7 +55,8 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, } static u32 __nft_fib6_eval_type(const struct nft_fib *priv, - const struct nft_pktinfo *pkt) + const struct nft_pktinfo *pkt, + struct ipv6hdr *iph) { const struct net_device *dev = NULL; const struct nf_ipv6_ops *v6ops; @@ -77,7 +78,7 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - nft_fib6_flowi_init(&fl6, priv, pkt, dev); + nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); v6ops = nf_get_ipv6_ops(); if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) @@ -131,9 +132,17 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_fib *priv = nft_expr_priv(expr); + int noff = skb_network_offset(pkt->skb); u32 *dest = ®s->data[priv->dreg]; + struct ipv6hdr *iph, _iph; - *dest = __nft_fib6_eval_type(priv, pkt); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); + if (!iph) { + regs->verdict.code = NFT_BREAK; + return; + } + + *dest = __nft_fib6_eval_type(priv, pkt, iph); } EXPORT_SYMBOL_GPL(nft_fib6_eval_type); @@ -141,8 +150,10 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_fib *priv = nft_expr_priv(expr); + int noff = skb_network_offset(pkt->skb); const struct net_device *oif = NULL; u32 *dest = ®s->data[priv->dreg]; + struct ipv6hdr *iph, _iph; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, @@ -155,7 +166,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, else if (priv->flags & NFTA_FIB_F_OIF) oif = nft_out(pkt); - lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); + if (!iph) { + regs->verdict.code = NFT_BREAK; + return; + } + + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 60be012fe708..e4462b0ff801 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(raw_v6_hashinfo); struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, - const struct in6_addr *rmt_addr, int dif) + const struct in6_addr *rmt_addr, int dif, int sdif) { bool is_multicast = ipv6_addr_is_multicast(loc_addr); @@ -86,7 +86,9 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif) continue; if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { @@ -178,7 +180,8 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) goto out; net = dev_net(skb->dev); - sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, inet6_iif(skb)); + sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, + inet6_iif(skb), inet6_sdif(skb)); while (sk) { int filtered; @@ -222,7 +225,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) } } sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr, - inet6_iif(skb)); + inet6_iif(skb), inet6_sdif(skb)); } out: read_unlock(&raw_v6_hashinfo.lock); @@ -378,7 +381,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, net = dev_net(skb->dev); while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr, - inet6_iif(skb)))) { + inet6_iif(skb), inet6_iif(skb)))) { rawv6_err(sk, skb, NULL, type, code, inner_offset, info); sk = sk_next(sk); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e1da5b888cc4..846012eae526 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -714,19 +714,13 @@ static void ip6_frags_sysctl_unregister(void) static int __net_init ipv6_frags_init_net(struct net *net) { - int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&net->ipv6.frags); - if (res) - return res; - res = ip6_frags_ns_sysctl_register(net); - if (res) - inet_frags_uninit_net(&net->ipv6.frags); - return res; + inet_frags_init_net(&net->ipv6.frags); + + return ip6_frags_ns_sysctl_register(net); } static void __net_exit ipv6_frags_exit_net(struct net *net) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2d0e7798c793..a96d5b385d8f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -446,16 +446,6 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -/* Multipath route selection: - * Hash based function using packet header and flowlabel. - * Adapted from fib_info_hashfn() - */ -static int rt6_info_hash_nhsfn(unsigned int candidate_count, - const struct flowi6 *fl6) -{ - return get_hash_from_flowi6(fl6) % candidate_count; -} - static struct rt6_info *rt6_multipath_select(struct rt6_info *match, struct flowi6 *fl6, int oif, int strict) @@ -463,7 +453,13 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, struct rt6_info *sibling, *next_sibling; int route_choosen; - route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6); + /* We might have already computed the hash for ICMPv6 errors. In such + * case it will always be non-zero. Otherwise now is the time to do it. + */ + if (!fl6->mp_hash) + fl6->mp_hash = rt6_multipath_hash(fl6, NULL); + + route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1); /* Don't change the route, if route_choosen == 0 * (siblings does not include ourself) */ @@ -959,10 +955,34 @@ int ip6_ins_rt(struct rt6_info *rt) return __ip6_ins_rt(rt, &info, &mxc, NULL); } +/* called with rcu_lock held */ +static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) +{ + struct net_device *dev = rt->dst.dev; + + if (rt->rt6i_flags & RTF_LOCAL) { + /* for copies of local routes, dst->dev needs to be the + * device if it is a master device, the master device if + * device is enslaved, and the loopback as the default + */ + if (netif_is_l3_slave(dev) && + !rt6_need_strict(&rt->rt6i_dst.addr)) + dev = l3mdev_master_dev_rcu(dev); + else if (!netif_is_l3_master(dev)) + dev = dev_net(dev)->loopback_dev; + /* last case is netif_is_l3_master(dev) is true in which + * case we want dev returned to be dev + */ + } + + return dev; +} + static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct net_device *dev; struct rt6_info *rt; /* @@ -972,8 +992,10 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) ort = (struct rt6_info *)ort->dst.from; - rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0); - + rcu_read_lock(); + dev = ip6_rt_get_dev_rcu(ort); + rt = __ip6_dst_alloc(dev_net(dev), dev, 0); + rcu_read_unlock(); if (!rt) return NULL; @@ -1001,11 +1023,13 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { + struct net_device *dev; struct rt6_info *pcpu_rt; - pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev), - rt->dst.dev, rt->dst.flags); - + rcu_read_lock(); + dev = ip6_rt_get_dev_rcu(rt); + pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags); + rcu_read_unlock(); if (!pcpu_rt) return NULL; ip6_rt_copy_init(pcpu_rt, rt); @@ -1187,6 +1211,54 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); +static void ip6_multipath_l3_keys(const struct sk_buff *skb, + struct flow_keys *keys) +{ + const struct ipv6hdr *outer_iph = ipv6_hdr(skb); + const struct ipv6hdr *key_iph = outer_iph; + const struct ipv6hdr *inner_iph; + const struct icmp6hdr *icmph; + struct ipv6hdr _inner_iph; + + if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) + goto out; + + icmph = icmp6_hdr(skb); + if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && + icmph->icmp6_type != ICMPV6_PKT_TOOBIG && + icmph->icmp6_type != ICMPV6_TIME_EXCEED && + icmph->icmp6_type != ICMPV6_PARAMPROB) + goto out; + + inner_iph = skb_header_pointer(skb, + skb_transport_offset(skb) + sizeof(*icmph), + sizeof(_inner_iph), &_inner_iph); + if (!inner_iph) + goto out; + + key_iph = inner_iph; +out: + memset(keys, 0, sizeof(*keys)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->addrs.v6addrs.src = key_iph->saddr; + keys->addrs.v6addrs.dst = key_iph->daddr; + keys->tags.flow_label = ip6_flowinfo(key_iph); + keys->basic.ip_proto = key_iph->nexthdr; +} + +/* if skb is set it will be used and fl6 can be NULL */ +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) +{ + struct flow_keys hash_keys; + + if (skb) { + ip6_multipath_l3_keys(skb, &hash_keys); + return flow_hash_from_keys(&hash_keys); + } + + return get_hash_from_flowi6(fl6); +} + void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -1205,6 +1277,8 @@ void ip6_route_input(struct sk_buff *skb) tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; + if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) + fl6.mp_hash = rt6_multipath_hash(&fl6, skb); skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } @@ -1251,7 +1325,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, - DST_OBSOLETE_NONE, 0); + DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); @@ -2698,15 +2772,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { u32 tb_id; struct net *net = dev_net(idev->dev); - struct net_device *dev = net->loopback_dev; + struct net_device *dev = idev->dev; struct rt6_info *rt; - /* use L3 Master device as loopback for host routes if device - * is enslaved and address is not link local or multicast - */ - if (!rt6_need_strict(addr)) - dev = l3mdev_master_dev_rcu(idev->dev) ? : dev; - rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); @@ -3337,6 +3405,9 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } + if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) + *flags |= RTNH_F_OFFLOAD; + /* not needed for multipath encoding b/c it has a rtnexthop struct */ if (!skip_oif && rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) @@ -3615,8 +3686,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net_device *dev; int flags = 0; - dev = __dev_get_by_index(net, iif); + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, iif); if (!dev) { + rcu_read_unlock(); err = -ENODEV; goto errout; } @@ -3628,15 +3702,19 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!fibmatch) dst = ip6_route_input_lookup(net, dev, &fl6, flags); + else + dst = ip6_route_lookup(net, &fl6, 0); + + rcu_read_unlock(); } else { fl6.flowi6_oif = oif; if (!fibmatch) dst = ip6_route_output(net, NULL, &fl6); + else + dst = ip6_route_lookup(net, &fl6, 0); } - if (fibmatch) - dst = ip6_route_lookup(net, &fl6, 0); rt = container_of(dst, struct rt6_info, dst); if (rt->dst.error) { @@ -3928,6 +4006,7 @@ static int __net_init ip6_route_net_init(struct net *net) ip6_template_metrics, true); #ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.fib6_has_custom_rules = false; net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*net->ipv6.ip6_prohibit_entry), GFP_KERNEL); @@ -4103,9 +4182,10 @@ int __init ip6_route_init(void) goto fib6_rules_init; ret = -ENOBUFS; - if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) || - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) || - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL)) + if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) || + __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) || + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED)) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 15fba55e3da8..c81407770956 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -40,7 +40,7 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) if (((srh->hdrlen + 1) << 3) != len) return false; - if (srh->segments_left != srh->first_segment) + if (srh->segments_left > srh->first_segment) return false; tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4); @@ -456,6 +456,10 @@ int __init seg6_init(void) err = seg6_iptunnel_init(); if (err) goto out_unregister_pernet; + + err = seg6_local_init(); + if (err) + goto out_unregister_pernet; #endif #ifdef CONFIG_IPV6_SEG6_HMAC @@ -471,6 +475,7 @@ out: #ifdef CONFIG_IPV6_SEG6_HMAC out_unregister_iptun: #ifdef CONFIG_IPV6_SEG6_LWTUNNEL + seg6_local_exit(); seg6_iptunnel_exit(); #endif #endif diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index f950cb53d5e3..33fb35cbfac1 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -12,6 +12,7 @@ */ #include <linux/errno.h> +#include <linux/kernel.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -110,7 +111,7 @@ static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) struct seg6_hmac_algo *algo; int i, alg_count; - alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); + alg_count = ARRAY_SIZE(hmac_algos); for (i = 0; i < alg_count; i++) { algo = &hmac_algos[i]; if (algo->alg_id == alg_id) @@ -360,7 +361,7 @@ static int seg6_hmac_init_algo(void) struct shash_desc *shash; int i, alg_count, cpu; - alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); + alg_count = ARRAY_SIZE(hmac_algos); for (i = 0; i < alg_count; i++) { struct crypto_shash **p_tfm; @@ -421,7 +422,7 @@ void seg6_hmac_exit(void) struct seg6_hmac_algo *algo = NULL; int i, alg_count, cpu; - alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); + alg_count = ARRAY_SIZE(hmac_algos); for (i = 0; i < alg_count; i++) { algo = &hmac_algos[i]; for_each_possible_cpu(cpu) { diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 264d772d3c7d..bd6cc688bd19 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -91,7 +91,7 @@ static void set_tun_src(struct net *net, struct net_device *dev, } /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ -static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) { struct net *net = dev_net(skb_dst(skb)->dev); struct ipv6hdr *hdr, *inner_hdr; @@ -116,15 +116,22 @@ static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) * hlim will be decremented in ip6_forward() afterwards and * decapsulation will overwrite inner hlim with outer hlim */ - ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), - ip6_flowlabel(inner_hdr)); - hdr->hop_limit = inner_hdr->hop_limit; + + if (skb->protocol == htons(ETH_P_IPV6)) { + ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), + ip6_flowlabel(inner_hdr)); + hdr->hop_limit = inner_hdr->hop_limit; + } else { + ip6_flow_hdr(hdr, 0, 0); + hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); + } + hdr->nexthdr = NEXTHDR_ROUTING; isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, osrh, hdrlen); - isrh->nexthdr = NEXTHDR_IPV6; + isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr); @@ -141,10 +148,10 @@ static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } +EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* insert an SRH within an IPv6 packet, just after the IPv6 header */ -#ifdef CONFIG_IPV6_SEG6_INLINE -static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; @@ -193,13 +200,13 @@ static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } -#endif +EXPORT_SYMBOL_GPL(seg6_do_srh_inline); static int seg6_do_srh(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; - int err = 0; + int proto, err = 0; tinfo = seg6_encap_lwtunnel(dst->lwtstate); @@ -209,19 +216,47 @@ static int seg6_do_srh(struct sk_buff *skb) } switch (tinfo->mode) { -#ifdef CONFIG_IPV6_SEG6_INLINE case SEG6_IPTUN_MODE_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EINVAL; + err = seg6_do_srh_inline(skb, tinfo->srh); + if (err) + return err; + skb_reset_inner_headers(skb); break; -#endif case SEG6_IPTUN_MODE_ENCAP: - err = seg6_do_srh_encap(skb, tinfo->srh); + if (skb->protocol == htons(ETH_P_IPV6)) + proto = IPPROTO_IPV6; + else if (skb->protocol == htons(ETH_P_IP)) + proto = IPPROTO_IPIP; + else + return -EINVAL; + + err = seg6_do_srh_encap(skb, tinfo->srh, proto); + if (err) + return err; + + skb->protocol = htons(ETH_P_IPV6); break; - } + case SEG6_IPTUN_MODE_L2ENCAP: + if (!skb_mac_header_was_set(skb)) + return -EINVAL; - if (err) - return err; + if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0) + return -ENOMEM; + + skb_mac_header_rebuild(skb); + skb_push(skb, skb->mac_len); + + err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE); + if (err) + return err; + + skb->protocol = htons(ETH_P_IPV6); + break; + } ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); @@ -336,6 +371,9 @@ static int seg6_build_state(struct nlattr *nla, struct seg6_lwt *slwt; int err; + if (family != AF_INET && family != AF_INET6) + return -EINVAL; + err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla, seg6_iptunnel_policy, extack); @@ -357,12 +395,15 @@ static int seg6_build_state(struct nlattr *nla, return -EINVAL; switch (tuninfo->mode) { -#ifdef CONFIG_IPV6_SEG6_INLINE case SEG6_IPTUN_MODE_INLINE: + if (family != AF_INET6) + return -EINVAL; + break; -#endif case SEG6_IPTUN_MODE_ENCAP: break; + case SEG6_IPTUN_MODE_L2ENCAP: + break; default: return -EINVAL; } @@ -386,8 +427,11 @@ static int seg6_build_state(struct nlattr *nla, memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); newts->type = LWTUNNEL_ENCAP_SEG6; - newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | - LWTUNNEL_STATE_INPUT_REDIRECT; + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + + if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP) + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + newts->headroom = seg6_lwt_headroom(tuninfo); *ts = newts; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c new file mode 100644 index 000000000000..825b8e01f947 --- /dev/null +++ b/net/ipv6/seg6_local.c @@ -0,0 +1,934 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun <david.lebrun@uclouvain.be> + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/net.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/lwtunnel.h> +#include <net/netevent.h> +#include <net/netns/generic.h> +#include <net/ip6_fib.h> +#include <net/route.h> +#include <net/seg6.h> +#include <linux/seg6.h> +#include <linux/seg6_local.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> +#include <net/dst_cache.h> +#ifdef CONFIG_IPV6_SEG6_HMAC +#include <net/seg6_hmac.h> +#endif +#include <linux/etherdevice.h> + +struct seg6_local_lwt; + +struct seg6_action_desc { + int action; + unsigned long attrs; + int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt); + int static_headroom; +}; + +struct seg6_local_lwt { + int action; + struct ipv6_sr_hdr *srh; + int table; + struct in_addr nh4; + struct in6_addr nh6; + int iif; + int oif; + + int headroom; + struct seg6_action_desc *desc; +}; + +static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct seg6_local_lwt *)lwt->data; +} + +static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) +{ + struct ipv6_sr_hdr *srh; + int len, srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return NULL; + + if (!pskb_may_pull(skb, srhoff + sizeof(*srh))) + return NULL; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + len = (srh->hdrlen + 1) << 3; + + if (!pskb_may_pull(skb, srhoff + len)) + return NULL; + + if (!seg6_validate_srh(srh, len)) + return NULL; + + return srh; +} + +static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb) +{ + struct ipv6_sr_hdr *srh; + + srh = get_srh(skb); + if (!srh) + return NULL; + + if (srh->segments_left == 0) + return NULL; + +#ifdef CONFIG_IPV6_SEG6_HMAC + if (!seg6_hmac_validate_skb(skb)) + return NULL; +#endif + + return srh; +} + +static bool decap_and_validate(struct sk_buff *skb, int proto) +{ + struct ipv6_sr_hdr *srh; + unsigned int off = 0; + + srh = get_srh(skb); + if (srh && srh->segments_left > 0) + return false; + +#ifdef CONFIG_IPV6_SEG6_HMAC + if (srh && !seg6_hmac_validate_skb(skb)) + return false; +#endif + + if (ipv6_find_hdr(skb, &off, proto, NULL, NULL) < 0) + return false; + + if (!pskb_pull(skb, off)) + return false; + + skb_postpull_rcsum(skb, skb_network_header(skb), off); + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + return true; +} + +static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) +{ + struct in6_addr *addr; + + srh->segments_left--; + addr = srh->segments + srh->segments_left; + *daddr = *addr; +} + +static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id) +{ + struct net *net = dev_net(skb->dev); + struct ipv6hdr *hdr = ipv6_hdr(skb); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct dst_entry *dst = NULL; + struct rt6_info *rt; + struct flowi6 fl6; + + fl6.flowi6_iif = skb->dev->ifindex; + fl6.daddr = nhaddr ? *nhaddr : hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + if (nhaddr) + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; + + if (!tbl_id) { + dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags); + } else { + struct fib6_table *table; + + table = fib6_get_table(net, tbl_id); + if (!table) + goto out; + + rt = ip6_pol_route(net, table, 0, &fl6, flags); + dst = &rt->dst; + } + + if (dst && dst->dev->flags & IFF_LOOPBACK && !dst->error) { + dst_release(dst); + dst = NULL; + } + +out: + if (!dst) { + rt = net->ipv6.ip6_blk_hole_entry; + dst = &rt->dst; + dst_hold(dst); + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); +} + +/* regular endpoint function */ +static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +/* regular endpoint, and forward to specified nexthop */ +static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + lookup_nexthop(skb, &slwt->nh6, 0); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + lookup_nexthop(skb, NULL, slwt->table); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +/* decapsulate and forward inner L2 frame on specified interface */ +static int input_action_end_dx2(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct net *net = dev_net(skb->dev); + struct net_device *odev; + struct ethhdr *eth; + + if (!decap_and_validate(skb, NEXTHDR_NONE)) + goto drop; + + if (!pskb_may_pull(skb, ETH_HLEN)) + goto drop; + + skb_reset_mac_header(skb); + eth = (struct ethhdr *)skb->data; + + /* To determine the frame's protocol, we assume it is 802.3. This avoids + * a call to eth_type_trans(), which is not really relevant for our + * use case. + */ + if (!eth_proto_is_802_3(eth->h_proto)) + goto drop; + + odev = dev_get_by_index_rcu(net, slwt->oif); + if (!odev) + goto drop; + + /* As we accept Ethernet frames, make sure the egress device is of + * the correct type. + */ + if (odev->type != ARPHRD_ETHER) + goto drop; + + if (!(odev->flags & IFF_UP) || !netif_carrier_ok(odev)) + goto drop; + + skb_orphan(skb); + + if (skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + if (skb->len - ETH_HLEN > odev->mtu) + goto drop; + + skb->dev = odev; + skb->protocol = eth->h_proto; + + return dev_queue_xmit(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +/* decapsulate and forward to specified nexthop */ +static int input_action_end_dx6(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct in6_addr *nhaddr = NULL; + + /* this function accepts IPv6 encapsulated packets, with either + * an SRH with SL=0, or no SRH. + */ + + if (!decap_and_validate(skb, IPPROTO_IPV6)) + goto drop; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + /* The inner packet is not associated to any local interface, + * so we do not call netif_rx(). + * + * If slwt->nh6 is set to ::, then lookup the nexthop for the + * inner packet's DA. Otherwise, use the specified nexthop. + */ + + if (!ipv6_addr_any(&slwt->nh6)) + nhaddr = &slwt->nh6; + + lookup_nexthop(skb, nhaddr, 0); + + return dst_input(skb); +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int input_action_end_dx4(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct iphdr *iph; + __be32 nhaddr; + int err; + + if (!decap_and_validate(skb, IPPROTO_IPIP)) + goto drop; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + + skb->protocol = htons(ETH_P_IP); + + iph = ip_hdr(skb); + + nhaddr = slwt->nh4.s_addr ?: iph->daddr; + + skb_dst_drop(skb); + + err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); + if (err) + goto drop; + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int input_action_end_dt6(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + if (!decap_and_validate(skb, IPPROTO_IPV6)) + goto drop; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + lookup_nexthop(skb, NULL, slwt->table); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +/* push an SRH on top of the current one */ +static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + int err = -EINVAL; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + + err = seg6_do_srh_inline(skb, slwt->srh); + if (err) + goto drop; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return err; +} + +/* encapsulate within an outer IPv6 header and a specified SRH */ +static int input_action_end_b6_encap(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + int err = -EINVAL; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + + err = seg6_do_srh_encap(skb, slwt->srh, IPPROTO_IPV6); + if (err) + goto drop; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return err; +} + +static struct seg6_action_desc seg6_action_table[] = { + { + .action = SEG6_LOCAL_ACTION_END, + .attrs = 0, + .input = input_action_end, + }, + { + .action = SEG6_LOCAL_ACTION_END_X, + .attrs = (1 << SEG6_LOCAL_NH6), + .input = input_action_end_x, + }, + { + .action = SEG6_LOCAL_ACTION_END_T, + .attrs = (1 << SEG6_LOCAL_TABLE), + .input = input_action_end_t, + }, + { + .action = SEG6_LOCAL_ACTION_END_DX2, + .attrs = (1 << SEG6_LOCAL_OIF), + .input = input_action_end_dx2, + }, + { + .action = SEG6_LOCAL_ACTION_END_DX6, + .attrs = (1 << SEG6_LOCAL_NH6), + .input = input_action_end_dx6, + }, + { + .action = SEG6_LOCAL_ACTION_END_DX4, + .attrs = (1 << SEG6_LOCAL_NH4), + .input = input_action_end_dx4, + }, + { + .action = SEG6_LOCAL_ACTION_END_DT6, + .attrs = (1 << SEG6_LOCAL_TABLE), + .input = input_action_end_dt6, + }, + { + .action = SEG6_LOCAL_ACTION_END_B6, + .attrs = (1 << SEG6_LOCAL_SRH), + .input = input_action_end_b6, + }, + { + .action = SEG6_LOCAL_ACTION_END_B6_ENCAP, + .attrs = (1 << SEG6_LOCAL_SRH), + .input = input_action_end_b6_encap, + .static_headroom = sizeof(struct ipv6hdr), + } +}; + +static struct seg6_action_desc *__get_action_desc(int action) +{ + struct seg6_action_desc *desc; + int i, count; + + count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc); + for (i = 0; i < count; i++) { + desc = &seg6_action_table[i]; + if (desc->action == action) + return desc; + } + + return NULL; +} + +static int seg6_local_input(struct sk_buff *skb) +{ + struct dst_entry *orig_dst = skb_dst(skb); + struct seg6_action_desc *desc; + struct seg6_local_lwt *slwt; + + if (skb->protocol != htons(ETH_P_IPV6)) { + kfree_skb(skb); + return -EINVAL; + } + + slwt = seg6_local_lwtunnel(orig_dst->lwtstate); + desc = slwt->desc; + + return desc->input(skb, slwt); +} + +static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { + [SEG6_LOCAL_ACTION] = { .type = NLA_U32 }, + [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, + [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, + [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, + .len = sizeof(struct in_addr) }, + [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, + [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, + [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, +}; + +static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + int len; + + srh = nla_data(attrs[SEG6_LOCAL_SRH]); + len = nla_len(attrs[SEG6_LOCAL_SRH]); + + /* SRH must contain at least one segment */ + if (len < sizeof(*srh) + sizeof(struct in6_addr)) + return -EINVAL; + + if (!seg6_validate_srh(srh, len)) + return -EINVAL; + + slwt->srh = kmalloc(len, GFP_KERNEL); + if (!slwt->srh) + return -ENOMEM; + + memcpy(slwt->srh, srh, len); + + slwt->headroom += len; + + return 0; +} + +static int put_nla_srh(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + struct nlattr *nla; + int len; + + srh = slwt->srh; + len = (srh->hdrlen + 1) << 3; + + nla = nla_reserve(skb, SEG6_LOCAL_SRH, len); + if (!nla) + return -EMSGSIZE; + + memcpy(nla_data(nla), srh, len); + + return 0; +} + +static int cmp_nla_srh(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + int len = (a->srh->hdrlen + 1) << 3; + + if (len != ((b->srh->hdrlen + 1) << 3)) + return 1; + + return memcmp(a->srh, b->srh, len); +} + +static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]); + + return 0; +} + +static int put_nla_table(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + if (nla_put_u32(skb, SEG6_LOCAL_TABLE, slwt->table)) + return -EMSGSIZE; + + return 0; +} + +static int cmp_nla_table(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + if (a->table != b->table) + return 1; + + return 0; +} + +static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]), + sizeof(struct in_addr)); + + return 0; +} + +static int put_nla_nh4(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct nlattr *nla; + + nla = nla_reserve(skb, SEG6_LOCAL_NH4, sizeof(struct in_addr)); + if (!nla) + return -EMSGSIZE; + + memcpy(nla_data(nla), &slwt->nh4, sizeof(struct in_addr)); + + return 0; +} + +static int cmp_nla_nh4(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + return memcmp(&a->nh4, &b->nh4, sizeof(struct in_addr)); +} + +static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + memcpy(&slwt->nh6, nla_data(attrs[SEG6_LOCAL_NH6]), + sizeof(struct in6_addr)); + + return 0; +} + +static int put_nla_nh6(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct nlattr *nla; + + nla = nla_reserve(skb, SEG6_LOCAL_NH6, sizeof(struct in6_addr)); + if (!nla) + return -EMSGSIZE; + + memcpy(nla_data(nla), &slwt->nh6, sizeof(struct in6_addr)); + + return 0; +} + +static int cmp_nla_nh6(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + return memcmp(&a->nh6, &b->nh6, sizeof(struct in6_addr)); +} + +static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + slwt->iif = nla_get_u32(attrs[SEG6_LOCAL_IIF]); + + return 0; +} + +static int put_nla_iif(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + if (nla_put_u32(skb, SEG6_LOCAL_IIF, slwt->iif)) + return -EMSGSIZE; + + return 0; +} + +static int cmp_nla_iif(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + if (a->iif != b->iif) + return 1; + + return 0; +} + +static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + slwt->oif = nla_get_u32(attrs[SEG6_LOCAL_OIF]); + + return 0; +} + +static int put_nla_oif(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + if (nla_put_u32(skb, SEG6_LOCAL_OIF, slwt->oif)) + return -EMSGSIZE; + + return 0; +} + +static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + if (a->oif != b->oif) + return 1; + + return 0; +} + +struct seg6_action_param { + int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); + int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); + int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b); +}; + +static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { + [SEG6_LOCAL_SRH] = { .parse = parse_nla_srh, + .put = put_nla_srh, + .cmp = cmp_nla_srh }, + + [SEG6_LOCAL_TABLE] = { .parse = parse_nla_table, + .put = put_nla_table, + .cmp = cmp_nla_table }, + + [SEG6_LOCAL_NH4] = { .parse = parse_nla_nh4, + .put = put_nla_nh4, + .cmp = cmp_nla_nh4 }, + + [SEG6_LOCAL_NH6] = { .parse = parse_nla_nh6, + .put = put_nla_nh6, + .cmp = cmp_nla_nh6 }, + + [SEG6_LOCAL_IIF] = { .parse = parse_nla_iif, + .put = put_nla_iif, + .cmp = cmp_nla_iif }, + + [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif, + .put = put_nla_oif, + .cmp = cmp_nla_oif }, +}; + +static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + struct seg6_action_param *param; + struct seg6_action_desc *desc; + int i, err; + + desc = __get_action_desc(slwt->action); + if (!desc) + return -EINVAL; + + if (!desc->input) + return -EOPNOTSUPP; + + slwt->desc = desc; + slwt->headroom += desc->static_headroom; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (desc->attrs & (1 << i)) { + if (!attrs[i]) + return -EINVAL; + + param = &seg6_action_params[i]; + + err = param->parse(attrs, slwt); + if (err < 0) + return err; + } + } + + return 0; +} + +static int seg6_local_build_state(struct nlattr *nla, unsigned int family, + const void *cfg, struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[SEG6_LOCAL_MAX + 1]; + struct lwtunnel_state *newts; + struct seg6_local_lwt *slwt; + int err; + + if (family != AF_INET6) + return -EINVAL; + + err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy, + extack); + + if (err < 0) + return err; + + if (!tb[SEG6_LOCAL_ACTION]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*slwt)); + if (!newts) + return -ENOMEM; + + slwt = seg6_local_lwtunnel(newts); + slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]); + + err = parse_nla_action(tb, slwt); + if (err < 0) + goto out_free; + + newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL; + newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT; + newts->headroom = slwt->headroom; + + *ts = newts; + + return 0; + +out_free: + kfree(slwt->srh); + kfree(newts); + return err; +} + +static void seg6_local_destroy_state(struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + + kfree(slwt->srh); +} + +static int seg6_local_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + struct seg6_action_param *param; + int i, err; + + if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action)) + return -EMSGSIZE; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (slwt->desc->attrs & (1 << i)) { + param = &seg6_action_params[i]; + err = param->put(skb, slwt); + if (err < 0) + return err; + } + } + + return 0; +} + +static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + unsigned long attrs; + int nlsize; + + nlsize = nla_total_size(4); /* action */ + + attrs = slwt->desc->attrs; + + if (attrs & (1 << SEG6_LOCAL_SRH)) + nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3); + + if (attrs & (1 << SEG6_LOCAL_TABLE)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_NH4)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_NH6)) + nlsize += nla_total_size(16); + + if (attrs & (1 << SEG6_LOCAL_IIF)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_OIF)) + nlsize += nla_total_size(4); + + return nlsize; +} + +static int seg6_local_cmp_encap(struct lwtunnel_state *a, + struct lwtunnel_state *b) +{ + struct seg6_local_lwt *slwt_a, *slwt_b; + struct seg6_action_param *param; + int i; + + slwt_a = seg6_local_lwtunnel(a); + slwt_b = seg6_local_lwtunnel(b); + + if (slwt_a->action != slwt_b->action) + return 1; + + if (slwt_a->desc->attrs != slwt_b->desc->attrs) + return 1; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (slwt_a->desc->attrs & (1 << i)) { + param = &seg6_action_params[i]; + if (param->cmp(slwt_a, slwt_b)) + return 1; + } + } + + return 0; +} + +static const struct lwtunnel_encap_ops seg6_local_ops = { + .build_state = seg6_local_build_state, + .destroy_state = seg6_local_destroy_state, + .input = seg6_local_input, + .fill_encap = seg6_local_fill_encap, + .get_encap_size = seg6_local_get_encap_size, + .cmp_encap = seg6_local_cmp_encap, + .owner = THIS_MODULE, +}; + +int __init seg6_local_init(void) +{ + return lwtunnel_encap_add_ops(&seg6_local_ops, + LWTUNNEL_ENCAP_SEG6_LOCAL); +} + +void seg6_local_exit(void) +{ + lwtunnel_encap_del_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); +} diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 69c50e737c54..6fbf8ae5e52c 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -90,6 +90,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "flowlabel_reflect", + .data = &init_net.ipv6.sysctl.flowlabel_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -149,6 +156,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; + ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 206210125fd7..64d94afa427f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -350,7 +350,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __inet6_lookup_established(net, &tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), - skb->dev->ifindex); + skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), @@ -918,7 +918,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), tcp_v6_iif(skb)); + ntohs(th->source), tcp_v6_iif(skb), + tcp_v6_sdif(skb)); if (!sk1) goto out; @@ -1296,7 +1297,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); if (opt_skb) goto ipv6_pktoptions; return 0; @@ -1393,10 +1394,13 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } static int tcp_v6_rcv(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; bool refcounted; @@ -1430,7 +1434,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) lookup: sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), - th->source, th->dest, inet6_iif(skb), + th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -1456,9 +1460,9 @@ process: } sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; @@ -1505,8 +1509,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); + ret = tcp_v6_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } @@ -1564,7 +1567,8 @@ do_time_wait: skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, - ntohs(th->dest), tcp_v6_iif(skb)); + ntohs(th->dest), tcp_v6_iif(skb), + sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); @@ -1611,7 +1615,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), - inet6_iif(skb)); + inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; @@ -1945,6 +1949,9 @@ struct proto tcpv6_prot = { .diag_destroy = tcp_abort, }; +/* thinking of making this const? Don't. + * early_demux can change based on sysctl. + */ static struct inet6_protocol tcpv6_protocol = { .early_demux = tcp_v6_early_demux, .early_demux_handler = tcp_v6_early_demux, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 56030d45823a..40d7234c27b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -129,7 +129,7 @@ static void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif, bool exact_dif) + int dif, int sdif, bool exact_dif) { int score; struct inet_sock *inet; @@ -161,9 +161,13 @@ static int compute_score(struct sock *sk, struct net *net, } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score++; + if (sk->sk_bound_dev_if && dev_match) + score++; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) @@ -175,9 +179,9 @@ static int compute_score(struct sock *sk, struct net *net, /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, unsigned int hnum, int dif, - bool exact_dif, struct udp_hslot *hslot2, - struct sk_buff *skb) + const struct in6_addr *daddr, unsigned int hnum, + int dif, int sdif, bool exact_dif, + struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; int score, badness, matches = 0, reuseport = 0; @@ -187,7 +191,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, exact_dif); + daddr, hnum, dif, sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -214,10 +218,10 @@ static struct sock *udp6_lib_lookup2(struct net *net, /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, __be16 dport, - int dif, struct udp_table *udptable, - struct sk_buff *skb) + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, int sdif, struct udp_table *udptable, + struct sk_buff *skb) { struct sock *sk, *result; unsigned short hnum = ntohs(dport); @@ -235,7 +239,7 @@ struct sock *__udp6_lib_lookup(struct net *net, goto begin; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, exact_dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; @@ -250,7 +254,7 @@ struct sock *__udp6_lib_lookup(struct net *net, goto begin; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } @@ -261,7 +265,7 @@ begin: badness = -1; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, - exact_dif); + sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -294,7 +298,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - udptable, skb); + inet6_sdif(skb), udptable, skb); } struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, @@ -304,7 +308,7 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - &udp_table, skb); + inet6_sdif(skb), &udp_table, skb); } EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb); @@ -320,7 +324,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be struct sock *sk; sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, - dif, &udp_table, NULL); + dif, 0, &udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -502,7 +506,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, - inet6_iif(skb), udptable, skb); + inet6_iif(skb), 0, udptable, skb); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); @@ -902,7 +906,7 @@ discard: static struct sock *__udp6_lib_demux_lookup(struct net *net, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif) + int dif, int sdif) { unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); @@ -913,7 +917,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (sk->sk_state == TCP_ESTABLISHED && - INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif)) + INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; @@ -928,6 +932,7 @@ static void udp_v6_early_demux(struct sk_buff *skb) struct sock *sk; struct dst_entry *dst; int dif = skb->dev->ifindex; + int sdif = inet6_sdif(skb); if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) @@ -939,7 +944,7 @@ static void udp_v6_early_demux(struct sk_buff *skb) sk = __udp6_lib_demux_lookup(net, uh->dest, &ipv6_hdr(skb)->daddr, uh->source, &ipv6_hdr(skb)->saddr, - dif); + dif, sdif); else return; @@ -1010,6 +1015,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; @@ -1475,6 +1481,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif +/* thinking of making this const? Don't. + * early_demux can change based on sysctl. + */ static struct inet6_protocol udpv6_protocol = { .early_demux = udp_v6_early_demux, .early_demux_handler = udp_v6_early_demux, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e7d378c032cb..455fd4e39333 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,109 +17,15 @@ #include <net/ip6_checksum.h> #include "ip6_offload.h" -static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int mss; - unsigned int unfrag_ip6hlen, unfrag_len; - struct frag_hdr *fptr; - u8 *packet_start, *prevhdr; - u8 nexthdr; - u8 frag_hdr_sz = sizeof(struct frag_hdr); - __wsum csum; - int tnl_hlen; - int err; - - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - /* Set the IPv6 fragment id if not set yet */ - if (!skb_shinfo(skb)->ip6_frag_id) - ipv6_proxy_select_ident(dev_net(skb->dev), skb); - - segs = NULL; - goto out; - } if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features, true); - else { - const struct ipv6hdr *ipv6h; - struct udphdr *uh; - - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - goto out; - - /* Do software UFO. Complete and fill in the UDP checksum as HW cannot - * do checksum of UDP packets sent as multiple IP fragments. - */ - - uh = udp_hdr(skb); - ipv6h = ipv6_hdr(skb); - - uh->check = 0; - csum = skb_checksum(skb, 0, skb->len, 0); - uh->check = udp_v6_check(skb->len, &ipv6h->saddr, - &ipv6h->daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; - - /* If there is no outer header we can fake a checksum offload - * due to the fact that we have already done the checksum in - * software prior to segmenting the frame. - */ - if (!skb->encap_hdr_csum) - features |= NETIF_F_HW_CSUM; - - /* Check if there is enough headroom to insert fragment header. */ - tnl_hlen = skb_tnl_header_len(skb); - if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { - if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) - goto out; - } - - /* Find the unfragmentable header and shift it left by frag_hdr_sz - * bytes to insert fragment header. - */ - err = ip6_find_1stfragopt(skb, &prevhdr); - if (err < 0) - return ERR_PTR(err); - unfrag_ip6hlen = err; - nexthdr = *prevhdr; - *prevhdr = NEXTHDR_FRAGMENT; - unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + - unfrag_ip6hlen + tnl_hlen; - packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; - memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); - - SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; - skb->mac_header -= frag_hdr_sz; - skb->network_header -= frag_hdr_sz; - - fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); - fptr->nexthdr = nexthdr; - fptr->reserved = 0; - if (!skb_shinfo(skb)->ip6_frag_id) - ipv6_proxy_select_ident(dev_net(skb->dev), skb); - fptr->identification = skb_shinfo(skb)->ip6_frag_id; - - /* Fragment the skb. ipv6 header and the remaining fields of the - * fragment header are updated in ipv6_gso_segment() - */ - segs = skb_segment(skb, features); - } -out: return segs; } @@ -169,7 +75,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_segment = udp6_ufo_fragment, + .gso_segment = udp6_tunnel_segment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, }, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 3ef5d913e7a3..f95943a13abc 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -34,6 +34,7 @@ EXPORT_SYMBOL(xfrm6_rcv_spi); int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); + int nhlen = skb->data - skb_network_header(skb); skb_network_header(skb)[IP6CB(skb)->nhoff] = XFRM_MODE_SKB_CB(skb)->protocol; @@ -43,8 +44,9 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return 1; #endif - __skb_push(skb, skb->data - skb_network_header(skb)); + __skb_push(skb, nhlen); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_postpush_rcsum(skb, skb_network_header(skb), nhlen); if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 79651bc71bf0..11d1314ab6c5 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -27,7 +27,8 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi6 fl6; struct dst_entry *dst; @@ -36,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; + fl6.flowi6_mark = mark; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -52,12 +54,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, } static int xfrm6_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct net_device *dev; - dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr); + dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; @@ -214,14 +217,6 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) } } -static inline int xfrm6_garbage_collect(struct dst_ops *ops) -{ - struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); - - xfrm_garbage_collect_deferred(net); - return dst_entries_get_fast(ops) > ops->gc_thresh * 2; -} - static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -279,14 +274,13 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, - .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, - .gc_thresh = INT_MAX, + .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { |