diff options
Diffstat (limited to 'net/ipv6')
52 files changed, 2043 insertions, 678 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f49bd7897e95..e1846b97ee69 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -186,7 +186,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, static void addrconf_dad_start(struct inet6_ifaddr *ifp); static void addrconf_dad_work(struct work_struct *w); -static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id); +static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, + bool send_na); static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(struct timer_list *t); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -3438,6 +3439,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } else if (event == NETDEV_CHANGE) { if (!addrconf_link_ready(dev)) { /* device is still not ready. */ + rt6_sync_down_dev(dev, event); break; } @@ -3449,6 +3451,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, * multicast snooping switches */ ipv6_mc_up(idev); + rt6_sync_up(dev, RTNH_F_LINKDOWN); break; } idev->if_flags |= IF_READY; @@ -3484,6 +3487,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (run_pending) addrconf_dad_run(idev); + /* Device has an address by now */ + rt6_sync_up(dev, RTNH_F_DEAD); + /* * If the MTU changed during the interface down, * when the interface up, the changed MTU must be @@ -3577,6 +3583,7 @@ static bool addr_is_local(const struct in6_addr *addr) static int addrconf_ifdown(struct net_device *dev, int how) { + unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN; struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; @@ -3586,8 +3593,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) ASSERT_RTNL(); - rt6_ifdown(net, dev); - neigh_ifdown(&nd_tbl, dev); + rt6_disable_ip(dev, event); idev = __in6_dev_get(dev); if (!idev) @@ -3833,12 +3839,17 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { + bool send_na = false; + + if (ifp->flags & IFA_F_TENTATIVE && + !(ifp->flags & IFA_F_OPTIMISTIC)) + send_na = true; bump_id = ifp->flags & IFA_F_TENTATIVE; ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); - addrconf_dad_completed(ifp, bump_id); + addrconf_dad_completed(ifp, bump_id, send_na); return; } @@ -3967,16 +3978,21 @@ static void addrconf_dad_work(struct work_struct *w) } if (ifp->dad_probes == 0) { + bool send_na = false; + /* * DAD was successful */ + if (ifp->flags & IFA_F_TENTATIVE && + !(ifp->flags & IFA_F_OPTIMISTIC)) + send_na = true; bump_id = ifp->flags & IFA_F_TENTATIVE; ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock(&ifp->lock); write_unlock_bh(&idev->lock); - addrconf_dad_completed(ifp, bump_id); + addrconf_dad_completed(ifp, bump_id, send_na); goto out; } @@ -4014,7 +4030,8 @@ static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp) return true; } -static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id) +static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, + bool send_na) { struct net_device *dev = ifp->idev->dev; struct in6_addr lladdr; @@ -4046,6 +4063,16 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id) if (send_mld) ipv6_mc_dad_complete(ifp->idev); + /* send unsolicited NA if enabled */ + if (send_na && + (ifp->idev->cnf.ndisc_notify || + dev_net(dev)->ipv6.devconf_all->ndisc_notify)) { + ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr, + /*router=*/ !!ifp->idev->cnf.forwarding, + /*solicited=*/ false, /*override=*/ true, + /*inc_opt=*/ true); + } + if (send_rs) { /* * If a host as already performed a random delay @@ -4209,7 +4236,6 @@ static int if6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations if6_fops = { - .owner = THIS_MODULE, .open = if6_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -4352,9 +4378,11 @@ restart: spin_lock(&ifpub->lock); ifpub->regen_count = 0; spin_unlock(&ifpub->lock); + rcu_read_unlock_bh(); ipv6_create_tempaddr(ifpub, ifp, true); in6_ifa_put(ifpub); in6_ifa_put(ifp); + rcu_read_lock_bh(); goto restart; } } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) @@ -6595,27 +6623,45 @@ int __init addrconf_init(void) rtnl_af_register(&inet6_ops); - err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, - 0); + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK, + NULL, inet6_dump_ifinfo, 0); if (err < 0) goto errout; - /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); - __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); - __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, - inet6_dump_ifmcaddr, 0); - __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, - inet6_dump_ifacaddr, 0); - __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED); - - ipv6_addr_label_rtnl_register(); + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR, + inet6_rtm_newaddr, NULL, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR, + inet6_rtm_deladdr, NULL, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR, + inet6_rtm_getaddr, inet6_dump_ifaddr, + RTNL_FLAG_DOIT_UNLOCKED); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST, + NULL, inet6_dump_ifmcaddr, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST, + NULL, inet6_dump_ifacaddr, 0); + if (err < 0) + goto errout; + err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF, + inet6_netconf_get_devconf, + inet6_netconf_dump_devconf, + RTNL_FLAG_DOIT_UNLOCKED); + if (err < 0) + goto errout; + err = ipv6_addr_label_rtnl_register(); + if (err < 0) + goto errout; return 0; errout: + rtnl_unregister_all(PF_INET6); rtnl_af_unregister(&inet6_ops); unregister_netdevice_notifier(&ipv6_dev_notf); errlo: diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 00e1f8ee08f8..1d6ced37ad71 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -547,13 +547,22 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return err; } -void __init ipv6_addr_label_rtnl_register(void) +int __init ipv6_addr_label_rtnl_register(void) { - __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); -} + int ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL, + ip6addrlbl_newdel, + NULL, RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) + return ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL, + ip6addrlbl_newdel, + NULL, RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) + return ret; + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, + ip6addrlbl_get, + ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); + return ret; +} diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c26f71234b9c..416917719a6f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -210,7 +210,6 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->autoflowlabel = ip6_default_np_autolabel(net); np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; @@ -285,6 +284,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); __be32 v4addr = 0; unsigned short snum; + bool saved_ipv6only; int addr_type = 0; int err = 0; @@ -390,19 +390,21 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; + saved_ipv6only = sk->sk_ipv6only; + if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED) + sk->sk_ipv6only = 1; + /* Make sure we are allowed to bind here. */ if ((snum || !inet->bind_address_no_port) && sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; goto out; } - if (addr_type != IPV6_ADDR_ANY) { + if (addr_type != IPV6_ADDR_ANY) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; - if (addr_type != IPV6_ADDR_MAPPED) - sk->sk_ipv6only = 1; - } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 0bbab8a4b5d8..8e085cc05aeb 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -533,7 +533,6 @@ static int ac6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ac6_seq_fops = { - .owner = THIS_MODULE, .open = ac6_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index a1f918713006..fbf08ce3f5ab 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -221,8 +221,7 @@ ipv4_connected: if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) { + if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) { err = -EINVAL; goto out; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a902ff8f59be..97513f35bcc5 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -141,14 +141,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x; + + if (xo && (xo->flags & XFRM_DEV_RESUME)) + x = skb->sp->xvec[skb->sp->len - 1]; + else + x = skb_dst(skb)->xfrm; tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); kfree(tmp); - xfrm_output_resume(skb, err); + + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + if (err) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); + kfree_skb(skb); + return; + } + + skb_push(skb, skb->data - skb_mac_header(skb)); + secpath_reset(skb); + xfrm_dev_resume(skb); + } else { + xfrm_output_resume(skb, err); + } } /* Move ESP header back into place. */ @@ -734,17 +752,13 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(aead_name, 0, mask); + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -774,7 +788,6 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; - u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -800,10 +813,7 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(authenc_name, 0, mask); + aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -890,13 +900,12 @@ static int esp6_init_state(struct xfrm_state *x) x->props.header_len += IPV4_BEET_PHMAXLEN + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); break; + default: case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct ipv6hdr); break; - default: - goto error; } align = ALIGN(crypto_aead_blocksize(aead), 4); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 333a478aa161..3fd1ec775dc2 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -60,7 +60,8 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head, int nhoff; int err; - skb_pull(skb, offset); + if (!pskb_pull(skb, offset)) + return NULL; if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; @@ -135,75 +136,39 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { - __u32 seq; - int err = 0; - struct sk_buff *skb2; struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) - goto out; + return ERR_PTR(-EINVAL); - seq = xo->seq.low; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) + return ERR_PTR(-EINVAL); x = skb->sp->xvec[skb->sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) - goto out; + return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) - goto out; + return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP)) + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - segs = x->outer_mode->gso_segment(x, skb, esp_features); - if (IS_ERR_OR_NULL(segs)) - goto out; - - __skb_pull(skb, skb->data - skb_mac_header(skb)); + xo->flags |= XFRM_GSO_SEGMENT; - skb2 = segs; - do { - struct sk_buff *nskb = skb2->next; - - xo = xfrm_offload(skb2); - xo->flags |= XFRM_GSO_SEGMENT; - xo->seq.low = seq; - xo->seq.hi = xfrm_replay_seqhi(x, seq); - - if(!(features & NETIF_F_HW_ESP)) - xo->flags |= CRYPTO_FALLBACK; - - x->outer_mode->xmit(x, skb2); - - err = x->type_offload->xmit(x, skb2, esp_features); - if (err) { - kfree_skb_list(segs); - return ERR_PTR(err); - } - - if (!skb_is_gso(skb2)) - seq++; - else - seq += skb_shinfo(skb2)->gso_segs; - - skb_push(skb2, skb2->mac_len); - skb2 = nskb; - } while (skb2); - -out: - return segs; + return x->outer_mode->gso_segment(x, skb, esp_features); } static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -222,6 +187,7 @@ static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { + int len; int err; int alen; int blksize; @@ -230,6 +196,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; + __u32 seq; esp.inplace = true; @@ -265,28 +232,33 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features return esp.nfrags; } + seq = xo->seq.low; + esph = ip_esp_hdr(skb); esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(xo->seq.low); - } else { - int len; - - len = skb->len - sizeof(struct ipv6hdr); - if (len > IPV6_MAXPLEN) - len = 0; + esph->seq_no = htonl(seq); - ipv6_hdr(skb)->payload_len = htons(len); + if (!skb_is_gso(skb)) + xo->seq.low++; + else + xo->seq.low += skb_shinfo(skb)->gso_segs; } + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + + ipv6_hdr(skb)->payload_len = htons(len); + if (hw_offload) return 0; - esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); - err = esp6_output_tail(x, skb, &esp); if (err) return err; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 83bd75713535..bc68eb661970 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -925,6 +925,15 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; + if (sr_ihdr->hdrlen > hops * 2) { + int tlvs_offset, tlvs_length; + + tlvs_offset = (1 + hops * 2) << 3; + tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; + memcpy((char *)sr_phdr + tlvs_offset, + (char *)sr_ihdr + tlvs_offset, tlvs_length); + } + #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 6eb5e68f112a..44c39c5f0638 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -512,9 +512,7 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) struct ila_map *ila; int ret; - ret = rhashtable_walk_start(rhiter); - if (ret && ret != -EAGAIN) - goto done; + rhashtable_walk_start(rhiter); for (;;) { ila = rhashtable_walk_next(rhiter); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b01858f5deb1..2febe26de6a1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net, } /* called with rcu_read_lock() */ +static struct sock *inet6_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, const struct in6_addr *daddr, + const unsigned short hnum, const int dif, const int sdif) +{ + bool exact_dif = inet6_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, dif, sdif, + exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -134,31 +168,56 @@ struct sock *inet6_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet6_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv6_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with in6addr_any */ + + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f5285f4e1d08..92b8d8c75eed 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -107,16 +107,13 @@ enum { void fib6_update_sernum(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - spin_lock_bh(&table->tb6_lock); fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&table->tb6_lock)); + lockdep_is_held(&rt->rt6i_table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); - spin_unlock_bh(&table->tb6_lock); } /* @@ -640,6 +637,11 @@ static struct fib6_node *fib6_add_1(struct net *net, if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); rt6_release(leaf); + /* remove null_entry in the root node */ + } else if (fn->fn_flags & RTN_TL_ROOT && + rcu_access_pointer(fn->leaf) == + net->ipv6.ip6_null_entry) { + RCU_INIT_POINTER(fn->leaf, NULL); } return fn; @@ -799,12 +801,6 @@ insert_above: return ln; } -static bool rt6_qualify_for_ecmp(struct rt6_info *rt) -{ - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) { int i; @@ -893,7 +889,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; for (iter = leaf; iter; - iter = rcu_dereference_protected(iter->dst.rt6_next, + iter = rcu_dereference_protected(iter->rt6_next, lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates @@ -950,7 +946,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, break; next_iter: - ins = &iter->dst.rt6_next; + ins = &iter->rt6_next; } if (fallback_ins && !found) { @@ -979,7 +975,7 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = rcu_dereference_protected(sibling->dst.rt6_next, + sibling = rcu_dereference_protected(sibling->rt6_next, lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of @@ -994,6 +990,7 @@ next_iter: rt6i_nsiblings++; } BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1009,7 +1006,7 @@ add: if (err) return err; - rcu_assign_pointer(rt->dst.rt6_next, iter); + rcu_assign_pointer(rt->rt6_next, iter); atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); @@ -1040,7 +1037,7 @@ add: atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); - rt->dst.rt6_next = iter->dst.rt6_next; + rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -1059,14 +1056,14 @@ add: if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - ins = &rt->dst.rt6_next; + ins = &rt->rt6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->dst.rt6_next; + *ins = iter->rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) @@ -1075,7 +1072,7 @@ add: nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { - ins = &iter->dst.rt6_next; + ins = &iter->rt6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1102,8 +1099,8 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -static void fib6_update_sernum_upto_root(struct rt6_info *rt, - int sernum) +static void __fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1117,6 +1114,11 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, } } +void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) +{ + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> @@ -1221,8 +1223,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } if (!rcu_access_pointer(fn->leaf)) { - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(fn->leaf, rt); + if (fn->fn_flags & RTN_TL_ROOT) { + /* put back null_entry for root node */ + rcu_assign_pointer(fn->leaf, + info->nl_net->ipv6.ip6_null_entry); + } else { + atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); + } } fn = sn; } @@ -1230,7 +1238,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, err = fib6_add_rt2node(fn, rt, info, mxc, extack); if (!err) { - fib6_update_sernum_upto_root(rt, sernum); + __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } @@ -1241,23 +1249,28 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, - lockdep_is_held(&table->tb6_lock)); - if (pn != fn && pn_leaf == rt) { - pn_leaf = NULL; - RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); - } - if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn_leaf = fib6_find_prefix(info->nl_net, table, pn); -#if RT6_DEBUG >= 2 - if (!pn_leaf) { - WARN_ON(!pn_leaf); - pn_leaf = info->nl_net->ipv6.ip6_null_entry; + if (pn != fn) { + struct rt6_info *pn_leaf = + rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + atomic_dec(&rt->rt6i_ref); } + if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, + pn); +#if RT6_DEBUG >= 2 + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = + info->nl_net->ipv6.ip6_null_entry; + } #endif - atomic_inc(&pn_leaf->rt6i_ref); - rcu_assign_pointer(pn->leaf, pn_leaf); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); + } } #endif goto failure; @@ -1265,13 +1278,17 @@ out: return err; failure: - /* fn->leaf could be NULL if fn is an intermediate node and we - * failed to add the new route to it in both subtree creation - * failure and fib6_add_rt2node() failure case. - * In both cases, fib6_repair_tree() should be called to fix - * fn->leaf. + /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: + * 1. fn is an intermediate node and we failed to add the new + * route to it in both subtree creation failure and fib6_add_rt2node() + * failure case. + * 2. fn is the root node in the table and we fail to add the first + * default route to it. */ - if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + if (fn && + (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || + (fn->fn_flags & RTN_TL_ROOT && + !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function @@ -1526,6 +1543,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_walker *w; int iter = 0; + /* Set fn->leaf to null_entry for root node. */ + if (fn->fn_flags & RTN_TL_ROOT) { + rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + return fn; + } + for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); @@ -1644,7 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); /* Unlink it */ - *rtp = rt->dst.rt6_next; + *rtp = rt->rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1665,6 +1688,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, sibling->rt6i_nsiblings--; rt->rt6i_nsiblings = 0; list_del_init(&rt->rt6i_siblings); + rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ @@ -1672,7 +1696,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + w->leaf = rcu_dereference_protected(rt->rt6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; @@ -1680,10 +1704,15 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } read_unlock(&net->ipv6.fib6_walker_lock); - /* If it was last route, expunge its radix tree node */ + /* If it was last route, call fib6_repair_tree() to: + * 1. For root node, put back null_entry as how the table was created. + * 2. For other nodes, expunge its radix tree node. + */ if (!rcu_access_pointer(fn->leaf)) { - fn->fn_flags &= ~RTN_RTINFO; - net->ipv6.rt6_stats->fib_route_nodes--; + if (!(fn->fn_flags & RTN_TL_ROOT)) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + } fn = fib6_repair_tree(net, table, fn); } @@ -1731,7 +1760,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) fib6_del_route(table, fn, rtp, info); return 0; } - rtp_next = &cur->dst.rt6_next; + rtp_next = &cur->rt6_next; } return -ENOENT; } @@ -1887,7 +1916,7 @@ static int fib6_clean_node(struct fib6_walker *w) for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); - if (res < 0) { + if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { @@ -1900,6 +1929,12 @@ static int fib6_clean_node(struct fib6_walker *w) continue; } return 0; + } else if (res == -2) { + if (WARN_ON(!rt->rt6i_nsiblings)) + continue; + rt = list_last_entry(&rt->rt6i_siblings, + struct rt6_info, rt6i_siblings); + continue; } WARN_ON(res != 0); } @@ -1911,7 +1946,8 @@ static int fib6_clean_node(struct fib6_walker *w) * Convenient frontend to tree walker. * * func is called on each route. - * It may return -1 -> delete this route. + * It may return -2 -> skip multipath route. + * -1 -> delete this route. * 0 -> continue walking */ @@ -2103,7 +2139,6 @@ static void fib6_net_exit(struct net *net) { unsigned int i; - rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { @@ -2142,8 +2177,8 @@ int __init fib6_init(void) if (ret) goto out_kmem_cache_create; - ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, - 0); + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, + inet6_dump_fib, 0); if (ret) goto out_unregister_subsys; @@ -2208,7 +2243,7 @@ static int ipv6_route_yield(struct fib6_walker *w) do { iter->w.leaf = rcu_dereference_protected( - iter->w.leaf->dst.rt6_next, + iter->w.leaf->rt6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) @@ -2274,7 +2309,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); + n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 7f59c8fabeeb..3dab664ff503 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -836,7 +836,6 @@ static int ip6fl_seq_release(struct inode *inode, struct file *file) } static const struct file_operations ip6fl_seq_fops = { - .owner = THIS_MODULE, .open = ip6fl_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4cfd8e0696fe..05f070e123e4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -55,6 +55,8 @@ #include <net/ip6_route.h> #include <net/ip6_tunnel.h> #include <net/gre.h> +#include <net/erspan.h> +#include <net/dst_metadata.h> static bool log_ecn_error = true; @@ -68,11 +70,13 @@ static unsigned int ip6gre_net_id __read_mostly; struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; + struct ip6_tnl __rcu *collect_md_tun; struct net_device *fb_tunnel_dev; }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly; static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; +static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly; static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); @@ -121,7 +125,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, unsigned int h1 = HASH_KEY(key); struct ip6_tnl *t, *cand = NULL; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + int dev_type = (gre_proto == htons(ETH_P_TEB) || + gre_proto == htons(ETH_P_ERSPAN)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; @@ -226,6 +231,10 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, if (cand) return cand; + t = rcu_dereference(ign->collect_md_tun); + if (t && t->dev->flags & IFF_UP) + return t; + dev = ign->fb_tunnel_dev; if (dev->flags & IFF_UP) return netdev_priv(dev); @@ -261,6 +270,9 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, t); + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -270,6 +282,9 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, NULL); + for (tp = ip6gre_bucket(ign, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -337,11 +352,12 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, 1); if (register_netdevice(dev) < 0) goto failed_free; + ip6gre_tnl_link_config(nt, 1); + /* Can use a lockless transmit, unless we generate output sequences */ if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; @@ -460,7 +476,101 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { - ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + __be64 tun_id; + __be16 flags; + + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return PACKET_REJECT; + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + } else { + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } + + return PACKET_RCVD; + } + + return PACKET_REJECT; +} + +static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, + struct tnl_ptk_info *tpi) +{ + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; + const struct ipv6hdr *ipv6h; + struct ip6_tnl *tunnel; + u8 ver; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + + ipv6h = ipv6_hdr(skb); + ershdr = (struct erspan_base_hdr *)skb->data; + ver = ershdr->ver; + tpi->key = cpu_to_be32(get_session_id(ershdr)); + + tunnel = ip6gre_tunnel_lookup(skb->dev, + &ipv6h->saddr, &ipv6h->daddr, tpi->key, + tpi->proto); + if (tunnel) { + int len = erspan_hdr_len(ver); + + if (unlikely(!pskb_may_pull(skb, len))) + return PACKET_REJECT; + + ershdr = (struct erspan_base_hdr *)skb->data; + pkt_md = (struct erspan_metadata *)(ershdr + 1); + + if (__iptunnel_pull_header(skb, len, + htons(ETH_P_TEB), + false, false) < 0) + return PACKET_REJECT; + + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + struct erspan_metadata *md; + __be64 tun_id; + __be16 flags; + + tpi->flags |= TUNNEL_KEY; + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, + sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + info = &tun_dst->u.tun_info; + md = ip_tunnel_info_opts(info); + + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + info->options_len = sizeof(*md); + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + + } else { + tunnel->parms.erspan_ver = ver; + + if (ver == 1) { + tunnel->parms.index = ntohl(pkt_md->u.index); + } else { + tunnel->parms.dir = pkt_md->u.md2.dir; + tunnel->parms.hwid = get_hwid(&pkt_md->u.md2); + } + + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } return PACKET_RCVD; } @@ -481,9 +591,17 @@ static int gre_rcv(struct sk_buff *skb) if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) goto drop; + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { + if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD) + return 0; + goto out; + } + if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0; +out: icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -496,6 +614,78 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum) csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } +static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) +{ + const struct iphdr *iph = ip_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + *encap_limit = t->parms.encap_limit; + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv4_get_dsfield(iph); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); +} + +static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + __u16 offset; + + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + *encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + *encap_limit = t->parms.encap_limit; + } + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv6_get_dsfield(ipv6h); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6->flowlabel |= ip6_flowlabel(ipv6h); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + return 0; +} + static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, @@ -517,8 +707,38 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; - gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, - protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + + if (tunnel->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + __be16 flags; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = IPPROTO_GRE; + fl6->daddr = key->u.ipv6.dst; + fl6->flowlabel = key->label; + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + tunnel->tun_hlen = gre_calc_hlen(flags); + + gre_build_header(skb, tunnel->tun_hlen, + flags, protocol, + tunnel_id_to_key32(tun_info->key.tun_id), 0); + + } else { + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + protocol, tunnel->parms.o_key, + htonl(tunnel->o_seqno)); + } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); @@ -527,30 +747,17 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); int encap_limit = -1; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (!t->parms.collect_md) + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -574,46 +781,17 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; - __u16 offset; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err; if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) return -1; - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; - } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (!t->parms.collect_md && + prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) + return -1; if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) return -1; @@ -660,7 +838,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + if (!t->parms.collect_md) + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -705,6 +884,137 @@ tx_err: return NETDEV_TX_OK; } +static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + struct dst_entry *dst = skb_dst(skb); + struct net_device_stats *stats; + bool truncate = false; + int encap_limit = -1; + __u8 dsfield = false; + struct flowi6 fl6; + int err = -EINVAL; + __u32 mtu; + + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) + goto tx_err; + + if (gre_handle_offloads(skb, false)) + goto tx_err; + + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); + truncate = true; + } + + t->parms.o_flags &= ~TUNNEL_KEY; + IPCB(skb)->flags = 0; + + /* For collect_md mode, derive fl6 from the tunnel key, + * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}. + */ + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + __be32 tun_id; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_GRE; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto tx_err; + + tun_id = tunnel_id_to_key32(key->tun_id); + if (md->version == 1) { + erspan_build_header(skb, + ntohl(tun_id), + ntohl(md->u.index), truncate, + false); + } else if (md->version == 2) { + erspan_build_header_v2(skb, + ntohl(tun_id), + md->u.md2.dir, + get_hwid(&md->u.md2), + truncate, false); + } + } else { + switch (skb->protocol) { + case htons(ETH_P_IP): + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); + break; + case htons(ETH_P_IPV6): + if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + goto tx_err; + if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, + &dsfield, &encap_limit)) + goto tx_err; + break; + default: + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + break; + } + + if (t->parms.erspan_ver == 1) + erspan_build_header(skb, ntohl(t->parms.o_key), + t->parms.index, + truncate, false); + else + erspan_build_header_v2(skb, ntohl(t->parms.o_key), + t->parms.dir, + t->parms.hwid, + truncate, false); + fl6.daddr = t->parms.raddr; + } + + /* Push GRE header. */ + gre_build_header(skb, 8, TUNNEL_SEQ, + htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++)); + + /* TooBig packet may have updated dst->dev's mtu */ + if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); + + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + NEXTHDR_GRE); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) { + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } + + goto tx_err; + } + return NETDEV_TX_OK; + +tx_err: + stats = &t->dev->stats; + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) { struct net_device *dev = t->dev; @@ -1014,6 +1324,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev) eth_random_addr(dev->perm_addr); } +#define GRE6_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM) + +static void ip6gre_tnl_init_features(struct net_device *dev) +{ + struct ip6_tnl *nt = netdev_priv(dev); + + dev->features |= GRE6_FEATURES; + dev->hw_features |= GRE6_FEATURES; + + if (!(nt->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(nt->parms.o_flags & TUNNEL_CSUM) || + nt->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } +} + static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1048,6 +1388,12 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; + if (tunnel->parms.collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } + ip6gre_tnl_init_features(dev); + return 0; } @@ -1062,6 +1408,9 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel = netdev_priv(dev); + if (tunnel->parms.collect_md) + return 0; + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); @@ -1084,7 +1433,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) dev_hold(dev); } - static struct inet6_protocol ip6gre_protocol __read_mostly = { .handler = gre_rcv, .err_handler = ip6gre_err, @@ -1099,7 +1447,8 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6gre_link_ops || - dev->rtnl_link_ops == &ip6gre_tap_ops) + dev->rtnl_link_ops == &ip6gre_tap_ops || + dev->rtnl_link_ops == &ip6erspan_tap_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { @@ -1221,6 +1570,70 @@ out: return ip6gre_tunnel_validate(tb, data, extack); } +static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + __be16 flags = 0; + int ret, ver = 0; + + if (!data) + return 0; + + ret = ip6gre_tap_validate(tb, data, extack); + if (ret) + return ret; + + /* ERSPAN should only have GRE sequence and key flag */ + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* ERSPAN Session ID only has 10-bit. Since we reuse + * 32-bit key field as ID, check it's range. + */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_ERSPAN_VER]) { + ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + if (ver != 1 && ver != 2) + return -EINVAL; + } + + if (ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (index & ~INDEX_MASK) + return -EINVAL; + } + } else if (ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + + if (dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + + if (data[IFLA_GRE_ERSPAN_HWID]) { + u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + + if (hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + + return 0; +} static void ip6gre_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) @@ -1267,11 +1680,26 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + + if (data[IFLA_GRE_COLLECT_METADATA]) + parms->collect_md = true; + + if (data[IFLA_GRE_ERSPAN_VER]) + parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + + if (parms->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) + parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + } else if (parms->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) + parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (data[IFLA_GRE_ERSPAN_HWID]) + parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + } } static int ip6gre_tap_init(struct net_device *dev) { - struct ip6_tnl *tunnel; int ret; ret = ip6gre_tunnel_init_common(dev); @@ -1280,10 +1708,6 @@ static int ip6gre_tap_init(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tunnel = netdev_priv(dev); - - ip6gre_tnl_link_config(tunnel, 1); - return 0; } @@ -1298,16 +1722,65 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_get_iflink = ip6_tnl_get_iflink, }; -#define GRE6_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_HW_CSUM) +static int ip6erspan_tap_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int t_hlen; + int ret; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + + tunnel->tun_hlen = 8; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + erspan_hdr_len(tunnel->parms.erspan_ver); + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; + if (dev->type == ARPHRD_ETHER) + dev->mtu -= ETH_HLEN; + if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + tunnel = netdev_priv(dev); + ip6gre_tnl_link_config(tunnel, 1); + + return 0; +} + +static const struct net_device_ops ip6erspan_netdev_ops = { + .ndo_init = ip6erspan_tap_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6erspan_tunnel_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip6_tnl_get_iflink, +}; static void ip6gre_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &ip6gre_tap_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; @@ -1372,40 +1845,29 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, ip6gre_netlink_parms(data, &nt->parms); - if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ign->collect_md_tun)) + return -EEXIST; + } else { + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + } if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - - dev->features |= GRE6_FEATURES; - dev->hw_features |= GRE6_FEATURES; - - if (!(nt->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported, nor - * can we support 2 levels of outer headers requiring - * an update. - */ - if (!(nt->parms.o_flags & TUNNEL_CSUM) || - (nt->encap.type == TUNNEL_ENCAP_NONE)) { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } - - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } err = register_netdevice(dev); if (err) goto out; + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); + + if (tb[IFLA_MTU]) + ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + dev_hold(dev); ip6gre_tunnel_link(ign, nt); @@ -1492,8 +1954,12 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + 0; } @@ -1515,7 +1981,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || - nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) || + nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -1528,6 +1995,24 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (p->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) + goto nla_put_failure; + + if (p->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) + goto nla_put_failure; + } else if (p->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) + goto nla_put_failure; + } + return 0; nla_put_failure: @@ -1550,9 +2035,28 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; +static void ip6erspan_tap_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->netdev_ops = &ip6erspan_netdev_ops; + dev->needs_free_netdev = true; + dev->priv_destructor = ip6gre_dev_free; + + dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); +} + static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .kind = "ip6gre", .maxtype = IFLA_GRE_MAX, @@ -1582,6 +2086,20 @@ static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; +static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = { + .kind = "ip6erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6erspan_tap_setup, + .validate = ip6erspan_tap_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, + .get_link_net = ip6_tnl_get_link_net, +}; + /* * And now the modules code and kernel interface. */ @@ -1610,9 +2128,15 @@ static int __init ip6gre_init(void) if (err < 0) goto tap_ops_failed; + err = rtnl_link_register(&ip6erspan_tap_ops); + if (err < 0) + goto erspan_link_failed; + out: return err; +erspan_link_failed: + rtnl_link_unregister(&ip6gre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ip6gre_link_ops); rtnl_link_failed: @@ -1626,6 +2150,7 @@ static void __exit ip6gre_fini(void) { rtnl_link_unregister(&ip6gre_tap_ops); rtnl_link_unregister(&ip6gre_link_ops); + rtnl_link_unregister(&ip6erspan_tap_ops); inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); unregister_pernet_device(&ip6gre_net_ops); } @@ -1637,4 +2162,5 @@ MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); MODULE_ALIAS_RTNL_LINK("ip6gretap"); +MODULE_ALIAS_RTNL_LINK("ip6erspan"); MODULE_ALIAS_NETDEV("ip6gre0"); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5110a418cc4d..997c7f19ad62 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -138,6 +138,14 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ret; } +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb_dst(skb)->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(net, sk, skb); + } +#endif + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) @@ -166,6 +174,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +{ + if (!np->autoflowlabel_set) + return ip6_default_np_autolabel(net); + else + return np->autoflowlabel; +} + /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified @@ -230,7 +246,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -362,7 +378,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) { unsigned int mtu; struct inet6_dev *idev; @@ -382,6 +398,7 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward); static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { @@ -1198,16 +1215,18 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } + if (mtu < IPV6_MIN_MTU) + return -EINVAL; cork->base.fragsize = mtu; - if (dst_allfrag(rt->dst.path)) + if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; @@ -1626,7 +1645,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; @@ -1725,11 +1744,13 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.flags = 0; cork.base.addr = 0; cork.base.opt = NULL; + cork.base.dst = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); - if (err) + if (err) { + ip6_cork_release(&cork, &v6_cork); return ERR_PTR(err); - + } if (ipc6->dontfrag < 0) ipc6->dontfrag = inet6_sk(sk)->dontfrag; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index db84f523656d..4b15fe928278 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -642,8 +642,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, - rel_info); + skb_dst_update_pmtu(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -861,7 +860,7 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, struct metadata_dst *tun_dst, bool log_ecn_err) { - return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate, + return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); @@ -979,6 +978,9 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, int ret = 0; struct net *net = t->net; + if (t->parms.collect_md) + return 1; + if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { @@ -1074,10 +1076,11 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } - } else if (!(t->parms.flags & - (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { - /* enable the cache only only if the routing decision does - * not depend on the current inner header value + } else if (t->parms.proto != 0 && !(t->parms.flags & + (IP6_TNL_F_USE_ORIG_TCLASS | + IP6_TNL_F_USE_ORIG_FWMARK))) { + /* enable the cache only if neither the outer protocol nor the + * routing decision depends on the current inner header value */ use_cache = true; } @@ -1123,10 +1126,14 @@ route_lookup: max_headroom += 8; mtu -= 8; } - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - if (skb_dst(skb) && !t->parms.collect_md) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + } else if (mtu < 576) { + mtu = 576; + } + + skb_dst_update_pmtu(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; @@ -1671,11 +1678,11 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); - if (tnl->parms.proto == IPPROTO_IPIP) { - if (new_mtu < ETH_MIN_MTU) + if (tnl->parms.proto == IPPROTO_IPV6) { + if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { - if (new_mtu < IPV6_MIN_MTU) + if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (new_mtu > 0xFFF8 - dev->hard_header_len) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index dbb74f3c57a7..fa3ae1cb50d3 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -483,7 +483,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { - skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) @@ -626,6 +626,7 @@ static void vti6_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; + struct net_device *tdev = NULL; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); @@ -638,6 +639,25 @@ static void vti6_link_config(struct ip6_tnl *t) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + int strict = (ipv6_addr_type(&p->raddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); + struct rt6_info *rt = rt6_lookup(t->net, + &p->raddr, &p->laddr, + p->link, strict); + + if (rt) + tdev = rt->dst.dev; + ip6_rt_put(rt); + } + + if (!tdev && p->link) + tdev = __dev_get_by_index(t->net, p->link); + + if (tdev) + dev->mtu = max_t(int, tdev->mtu - dev->hard_header_len, + IPV6_MIN_MTU); } /** diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a2e1a864eb46..9f6cace9c817 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -477,7 +477,6 @@ static int ip6mr_vif_open(struct inode *inode, struct file *file) } static const struct file_operations ip6mr_vif_fops = { - .owner = THIS_MODULE, .open = ip6mr_vif_open, .read = seq_read, .llseek = seq_lseek, @@ -495,6 +494,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) return ERR_PTR(-ENOENT); it->mrt = mrt; + it->cache = NULL; return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) : SEQ_START_TOKEN; } @@ -609,7 +609,6 @@ static int ipmr_mfc_open(struct inode *inode, struct file *file) } static const struct file_operations ip6mr_mfc_fops = { - .owner = THIS_MODULE, .open = ipmr_mfc_open, .read = seq_read, .llseek = seq_lseek, @@ -1425,10 +1424,13 @@ int __init ip6_mr_init(void) goto add_proto_fail; } #endif - rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, - ip6mr_rtm_dumproute, 0); - return 0; + err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, + NULL, ip6mr_rtm_dumproute, 0); + if (err == 0) + return 0; + #ifdef CONFIG_IPV6_PIMSM_V2 + inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b9404feabd78..e8ffb5b5d84e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -886,6 +886,7 @@ pref_skip_coa: break; case IPV6_AUTOFLOWLABEL: np->autoflowlabel = valbool; + np->autoflowlabel_set = 1; retv = 0; break; case IPV6_RECVFRAGSIZE: @@ -1335,7 +1336,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_AUTOFLOWLABEL: - val = np->autoflowlabel; + val = ip6_autoflowlabel(sock_net(sk), np); break; case IPV6_RECVFRAGSIZE: diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 844642682b83..6a5d0e39bb87 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1655,8 +1655,6 @@ static void mld_sendpack(struct sk_buff *skb) if (err) goto err_out; - payload_len = skb->len; - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, net->ipv6.igmp_sk, skb, NULL, skb->dev, dst_output); @@ -2758,7 +2756,6 @@ static int igmp6_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp6_mc_seq_fops = { - .owner = THIS_MODULE, .open = igmp6_mc_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2913,7 +2910,6 @@ static int igmp6_mcf_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp6_mcf_seq_fops = { - .owner = THIS_MODULE, .open = igmp6_mcf_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b3cea200c85e..f61a5b613b52 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -566,6 +566,11 @@ static void ndisc_send_unsol_na(struct net_device *dev) read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { + /* skip tentative addresses until dad completes */ + if (ifa->flags & IFA_F_TENTATIVE && + !(ifa->flags & IFA_F_OPTIMISTIC)) + continue; + ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 39970e212ad5..d95ceca7ff8f 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -68,32 +68,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) } EXPORT_SYMBOL(ip6_route_me_harder); -/* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - -struct ip6_rt_info { - struct in6_addr daddr; - struct in6_addr saddr; - u_int32_t mark; -}; - -static void nf_ip6_saveroute(const struct sk_buff *skb, - struct nf_queue_entry *entry) -{ - struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct ipv6hdr *iph = ipv6_hdr(skb); - - rt_info->daddr = iph->daddr; - rt_info->saddr = iph->saddr; - rt_info->mark = skb->mark; - } -} - -static int nf_ip6_reroute(struct net *net, struct sk_buff *skb, +static int nf_ip6_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -103,7 +78,7 @@ static int nf_ip6_reroute(struct net *net, struct sk_buff *skb, if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) - return ip6_route_me_harder(net, skb); + return ip6_route_me_harder(entry->state.net, skb); } return 0; } @@ -190,25 +165,19 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, }; static const struct nf_ipv6_ops ipv6ops = { - .chk_addr = ipv6_chk_addr, - .route_input = ip6_route_input, - .fragment = ip6_fragment -}; - -static const struct nf_afinfo nf_ip6_afinfo = { - .family = AF_INET6, + .chk_addr = ipv6_chk_addr, + .route_input = ip6_route_input, + .fragment = ip6_fragment, .checksum = nf_ip6_checksum, .checksum_partial = nf_ip6_checksum_partial, .route = nf_ip6_route, - .saveroute = nf_ip6_saveroute, .reroute = nf_ip6_reroute, - .route_key_size = sizeof(struct ip6_rt_info), }; int __init ipv6_netfilter_init(void) { RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); - return nf_register_afinfo(&nf_ip6_afinfo); + return 0; } /* This can be called from inet6_init() on errors, so it cannot @@ -217,5 +186,4 @@ int __init ipv6_netfilter_init(void) void ipv6_netfilter_fini(void) { RCU_INIT_POINTER(nf_ipv6_ops, NULL); - nf_unregister_afinfo(&nf_ip6_afinfo); } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6acb2eecd986..4a634b7a2c80 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -71,6 +71,15 @@ config NFT_FIB_IPV6 endif # NF_TABLES_IPV6 endif # NF_TABLES +config NF_FLOW_TABLE_IPV6 + tristate "Netfilter flow table IPv6 module" + depends on NF_CONNTRACK && NF_TABLES + select NF_FLOW_TABLE + help + This option adds the flow table IPv6 support. + + To compile it as a module, choose M here. + config NF_DUP_IPV6 tristate "Netfilter IPv6 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK @@ -232,6 +241,15 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_MATCH_SRH + tristate '"srh" Segment Routing header match support' + depends on NETFILTER_ADVANCED + help + srh matching allows you to match packets based on the segment + routing header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + # The targets config IP6_NF_TARGET_HL tristate '"HL" hoplimit target support' diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index c6ee0cdd0ba9..d984057b8395 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -45,6 +45,9 @@ obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o +# flow table support +obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o + # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o @@ -54,6 +57,7 @@ obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o +obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o # targets obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1d7ae9366335..af4c917e0836 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -282,12 +282,7 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = table->private; - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); + private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; @@ -991,9 +986,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(AF_INET6); #endif - t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), - "ip6table_%s", name); - if (t) { + t = xt_request_find_table_lock(net, AF_INET6, name); + if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1023,7 +1017,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(AF_INET6); @@ -1049,7 +1043,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); - if (t) { + if (!IS_ERR(t)) { struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1060,7 +1054,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -1083,10 +1077,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto out; } - t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), - "ip6table_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, AF_INET6, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1199,8 +1192,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1636,7 +1629,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); @@ -1650,7 +1643,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(AF_INET6); return ret; @@ -1954,7 +1947,6 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err5; - pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c new file mode 100644 index 000000000000..9642164107ce --- /dev/null +++ b/net/ipv6/netfilter/ip6t_srh.c @@ -0,0 +1,161 @@ +/* Kernel module to match Segment Routing Header (SRH) parameters. */ + +/* Author: + * Ahmed Abdelsalam <amsalam20@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/ipv6.h> +#include <net/seg6.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6t_srh.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +/* Test a struct->mt_invflags and a boolean for inequality */ +#define NF_SRH_INVF(ptr, flag, boolean) \ + ((boolean) ^ !!((ptr)->mt_invflags & (flag))) + +static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_srh *srhinfo = par->matchinfo; + struct ipv6_sr_hdr *srh; + struct ipv6_sr_hdr _srh; + int hdrlen, srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return false; + srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); + if (!srh) + return false; + + hdrlen = ipv6_optlen(srh); + if (skb->len - srhoff < hdrlen) + return false; + + if (srh->type != IPV6_SRCRT_TYPE_4) + return false; + + if (srh->segments_left > srh->first_segment) + return false; + + /* Next Header matching */ + if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, + !(srh->nexthdr == srhinfo->next_hdr))) + return false; + + /* Header Extension Length matching */ + if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, + !(srh->hdrlen == srhinfo->hdr_len))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, + !(srh->hdrlen > srhinfo->hdr_len))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, + !(srh->hdrlen < srhinfo->hdr_len))) + return false; + + /* Segments Left matching */ + if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, + !(srh->segments_left == srhinfo->segs_left))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, + !(srh->segments_left > srhinfo->segs_left))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, + !(srh->segments_left < srhinfo->segs_left))) + return false; + + /** + * Last Entry matching + * Last_Entry field was introduced in revision 6 of the SRH draft. + * It was called First_Segment in the previous revision + */ + if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, + !(srh->first_segment == srhinfo->last_entry))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, + !(srh->first_segment > srhinfo->last_entry))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, + !(srh->first_segment < srhinfo->last_entry))) + return false; + + /** + * Tag matchig + * Tag field was introduced in revision 6 of the SRH draft. + */ + if (srhinfo->mt_flags & IP6T_SRH_TAG) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, + !(srh->tag == srhinfo->tag))) + return false; + return true; +} + +static int srh_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_srh *srhinfo = par->matchinfo; + + if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { + pr_err("unknown srh match flags %X\n", srhinfo->mt_flags); + return -EINVAL; + } + + if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { + pr_err("unknown srh invflags %X\n", srhinfo->mt_invflags); + return -EINVAL; + } + + return 0; +} + +static struct xt_match srh_mt6_reg __read_mostly = { + .name = "srh", + .family = NFPROTO_IPV6, + .match = srh_mt6, + .matchsize = sizeof(struct ip6t_srh), + .checkentry = srh_mt6_check, + .me = THIS_MODULE, +}; + +static int __init srh_mt6_init(void) +{ + return xt_register_match(&srh_mt6_reg); +} + +static void __exit srh_mt6_exit(void) +{ + xt_unregister_match(&srh_mt6_reg); +} + +module_init(srh_mt6_init); +module_exit(srh_mt6_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match"); +MODULE_AUTHOR("Ahmed Abdelsalam <amsalam20@gmail.com>"); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 2b1a9dcdbcb3..b0524b18c4fb 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -42,14 +42,6 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) u_int8_t hop_limit; u_int32_t flowlabel, mark; int err; -#if 0 - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) { - net_warn_ratelimited("ip6t_hook: happy cracking\n"); - return NF_ACCEPT; - } -#endif /* save source/dest address, mark, hoplimit, flowlabel, priority, */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 991512576c8c..47306e45a80a 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -74,6 +74,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_in, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, @@ -81,6 +82,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_out, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, @@ -88,12 +90,14 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_local_fn, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = ip6table_nat_fn, + .nat_hook = true, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index d4bc56443dc1..710fa0806c37 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> @@ -11,6 +12,10 @@ static int __net_init ip6table_raw_table_init(struct net *net); +static bool raw_before_defrag __read_mostly; +MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); +module_param(raw_before_defrag, bool, 0000); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, @@ -20,6 +25,15 @@ static const struct xt_table packet_raw = { .table_init = ip6table_raw_table_init, }; +static const struct xt_table packet_raw_before_defrag = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG, + .table_init = ip6table_raw_table_init, +}; + /* The work comes in here from netfilter.c. */ static unsigned int ip6table_raw_hook(void *priv, struct sk_buff *skb, @@ -33,15 +47,19 @@ static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; + const struct xt_table *table = &packet_raw; int ret; + if (raw_before_defrag) + table = &packet_raw_before_defrag; + if (net->ipv6.ip6table_raw) return 0; - repl = ip6t_alloc_initial_table(&packet_raw); + repl = ip6t_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops, + ret = ip6t_register_table(net, table, repl, rawtable_ops, &net->ipv6.ip6table_raw); kfree(repl); return ret; @@ -62,9 +80,16 @@ static struct pernet_operations ip6table_raw_net_ops = { static int __init ip6table_raw_init(void) { int ret; + const struct xt_table *table = &packet_raw; + + if (raw_before_defrag) { + table = &packet_raw_before_defrag; + + pr_info("Enabling raw table before defrag\n"); + } /* Register hooks */ - rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook); + rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook); if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 3b80a38f62b8..11a313fd9273 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -176,11 +176,6 @@ static unsigned int ipv6_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) { - net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); - return NF_ACCEPT; - } return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } @@ -368,7 +363,7 @@ static struct nf_sockopt_ops so_getorigdst6 = { .owner = THIS_MODULE, }; -static struct nf_conntrack_l4proto *builtin_l4proto6[] = { +static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = { &nf_conntrack_l4proto_tcp6, &nf_conntrack_l4proto_udp6, &nf_conntrack_l4proto_icmpv6, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 3ac0d826afc4..2548e2c8aedd 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -27,7 +27,7 @@ #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> #include <net/netfilter/nf_log.h> -static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ; +static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) { @@ -352,7 +352,7 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.icmpv6.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_ICMPV6, diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 977d8900cfd1..ce53dcfda88a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -231,7 +231,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, if ((unsigned int)end > IPV6_MAXPLEN) { pr_debug("offset is too large.\n"); - return -1; + return -EINVAL; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); @@ -264,7 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - return -1; + return -EPROTO; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ @@ -358,7 +358,7 @@ found: discard_fq: inet_frag_kill(&fq->q, &nf_frags); err: - return -1; + return -EINVAL; } /* @@ -567,6 +567,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { + u16 savethdr = skb->transport_header; struct net_device *dev = skb->dev; int fhoff, nhoff, ret; struct frag_hdr *fhdr; @@ -600,8 +601,12 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) spin_lock_bh(&fq->q.lock); - if (nf_ct_frag6_queue(fq, skb, fhdr, nhoff) < 0) { - ret = -EINVAL; + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); + if (ret < 0) { + if (ret == -EPROTO) { + skb->transport_header = savethdr; + ret = 0; + } goto out_unlock; } diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index b326da59257f..c87b48359e8f 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -63,6 +63,9 @@ static unsigned int ipv6_defrag(void *priv, /* Previously seen (loopback)? */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; + + if (skb->_nfct == IP_CT_UNTRACKED) + return NF_ACCEPT; #endif err = nf_ct_frag6_gather(state->net, skb, diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c new file mode 100644 index 000000000000..fff21602875a --- /dev/null +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -0,0 +1,277 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/ipv6.h> +#include <linux/netdevice.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/neighbour.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> +/* For layer 4 checksum field offset. */ +#include <linux/tcp.h> +#include <linux/udp.h> + +static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + + return 0; +} + +static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, struct in6_addr *addr, + struct in6_addr *new_addr) +{ + switch (ip6h->nexthdr) { + case IPPROTO_TCP: + if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; + ip6h->daddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_dnat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6; + ip6h->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_nat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, + enum flow_offload_tuple_dir dir) +{ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + unsigned int thoff = sizeof(*ip6h); + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + + return 0; +} + +static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + struct ipv6hdr *ip6h; + unsigned int thoff; + + if (!pskb_may_pull(skb, sizeof(*ip6h))) + return -1; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_TCP && + ip6h->nexthdr != IPPROTO_UDP) + return -1; + + thoff = sizeof(*ip6h); + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v6 = ip6h->saddr; + tuple->dst_v6 = ip6h->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET6; + tuple->l4proto = ip6h->nexthdr; + tuple->iifidx = dev->ifindex; + + return 0; +} + +/* Based on ip_exceeds_mtu(). */ +static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + return false; + + return true; +} + +static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt) +{ + u32 mtu; + + mtu = ip6_dst_mtu_forward(&rt->dst); + if (__nf_flow_exceeds_mtu(skb, mtu)) + return true; + + return false; +} + +unsigned int +nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + struct in6_addr *nexthop; + struct ipv6hdr *ip6h; + struct rt6_info *rt; + + if (skb->protocol != htons(ETH_P_IPV6)) + return NF_ACCEPT; + + if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + + rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; + if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*ip6h))) + return NF_DROP; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ipv6(flow, skb, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + ip6h = ipv6_hdr(skb); + ip6h->hop_limit--; + + skb->dev = outdev; + nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); + +static struct nf_flowtable_type flowtable_ipv6 = { + .family = NFPROTO_IPV6, + .params = &nf_flow_offload_rhash_params, + .gc = nf_flow_offload_work_gc, + .hook = nf_flow_offload_ipv6_hook, + .owner = THIS_MODULE, +}; + +static int __init nf_flow_ipv6_module_init(void) +{ + nft_register_flowtable_type(&flowtable_ipv6); + + return 0; +} + +static void __exit nf_flow_ipv6_module_exit(void) +{ + nft_unregister_flowtable_type(&flowtable_ipv6); +} + +module_init(nf_flow_ipv6_module_init); +module_exit(nf_flow_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET6); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 1d2fb9267d6f..bed57ee65f7b 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -369,10 +369,6 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, #endif unsigned int ret; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && @@ -408,10 +404,6 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, unsigned int ret; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index d6e4ba5de916..17e03589331c 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -22,68 +22,12 @@ static unsigned int nft_do_chain_ipv6(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv6_output(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (unlikely(skb->len < sizeof(struct ipv6hdr))) { - if (net_ratelimit()) - pr_info("nf_tables_ipv6: ignoring short SOCK_RAW " - "packet\n"); - return NF_ACCEPT; - } - - return nft_do_chain_ipv6(priv, skb, state); -} - -struct nft_af_info nft_af_ipv6 __read_mostly = { - .family = NFPROTO_IPV6, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, - [NF_INET_LOCAL_OUT] = nft_ipv6_output, - [NF_INET_FORWARD] = nft_do_chain_ipv6, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, - }, -}; -EXPORT_SYMBOL_GPL(nft_af_ipv6); - -static int nf_tables_ipv6_init_net(struct net *net) -{ - net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.ipv6 == NULL) - return -ENOMEM; - - memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6)); - - if (nft_register_afinfo(net, net->nft.ipv6) < 0) - goto err; - - return 0; -err: - kfree(net->nft.ipv6); - return -ENOMEM; -} - -static void nf_tables_ipv6_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.ipv6); - kfree(net->nft.ipv6); -} - -static struct pernet_operations nf_tables_ipv6_net_ops = { - .init = nf_tables_ipv6_init_net, - .exit = nf_tables_ipv6_exit_net, -}; - static const struct nf_chain_type filter_ipv6 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -94,26 +38,22 @@ static const struct nf_chain_type filter_ipv6 = { (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, + [NF_INET_FORWARD] = nft_do_chain_ipv6, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, + }, }; static int __init nf_tables_ipv6_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_ipv6); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_ipv6); - - return ret; + return nft_register_chain_type(&filter_ipv6); } static void __exit nf_tables_ipv6_exit(void) { - unregister_pernet_subsys(&nf_tables_ipv6_net_ops); nft_unregister_chain_type(&filter_ipv6); } @@ -122,4 +62,4 @@ module_exit(nf_tables_ipv6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_INET6); +MODULE_ALIAS_NFT_CHAIN(AF_INET6, "filter"); diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 443cd306c0b0..73fe2bd13fcf 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -31,7 +31,8 @@ static unsigned int nft_nat_do_chain(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); return nft_do_chain(&pkt, priv); } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index f2727475895e..11d3c3b9aa18 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -33,7 +33,8 @@ static unsigned int nf_route_table_hook(void *priv, u32 mark, flowlabel; int err; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 54b5899543ef..cc5174c7254c 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -60,7 +60,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, { const struct net_device *dev = NULL; const struct nf_ipv6_ops *v6ops; - const struct nf_afinfo *afinfo; int route_err, addrtype; struct rt6_info *rt; struct flowi6 fl6 = { @@ -69,8 +68,8 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, }; u32 ret = 0; - afinfo = nf_get_afinfo(NFPROTO_IPV6); - if (!afinfo) + v6ops = nf_get_ipv6_ops(); + if (!v6ops) return RTN_UNREACHABLE; if (priv->flags & NFTA_FIB_F_IIF) @@ -80,12 +79,11 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); - v6ops = nf_get_ipv6_ops(); - if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) + if (dev && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) ret = RTN_LOCAL; - route_err = afinfo->route(nft_net(pkt), (struct dst_entry **)&rt, - flowi6_to_flowi(&fl6), false); + route_err = v6ops->route(nft_net(pkt), (struct dst_entry **)&rt, + flowi6_to_flowi(&fl6), false); if (route_err) goto err; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index e88bcb8ff0fd..b67814242f78 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -58,7 +58,6 @@ static int sockstat6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations sockstat6_seq_fops = { - .owner = THIS_MODULE, .open = sockstat6_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -248,7 +247,6 @@ static int snmp6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations snmp6_seq_fops = { - .owner = THIS_MODULE, .open = snmp6_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -274,7 +272,6 @@ static int snmp6_dev_seq_open(struct inode *inode, struct file *file) } static const struct file_operations snmp6_dev_seq_fops = { - .owner = THIS_MODULE, .open = snmp6_dev_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 761a473a07c5..ddda7eb3c623 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1308,7 +1308,6 @@ static int raw6_seq_open(struct inode *inode, struct file *file) } static const struct file_operations raw6_seq_fops = { - .owner = THIS_MODULE, .open = raw6_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7a8d1500d374..fb2d251c0500 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -186,7 +186,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) { - return dst_metrics_write_ptr(rt->dst.from); + return dst_metrics_write_ptr(&rt->from->dst); } static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) @@ -391,7 +391,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; struct rt6_exception_bucket *bucket; - struct dst_entry *from = dst->from; + struct rt6_info *from = rt->from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -409,8 +409,8 @@ static void ip6_dst_destroy(struct dst_entry *dst) kfree(bucket); } - dst->from = NULL; - dst_release(from); + rt->from = NULL; + dst_release(&from->dst); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -443,9 +443,9 @@ static bool rt6_check_expired(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->dst.from) { + } else if (rt->from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - rt6_check_expired((struct rt6_info *)rt->dst.from); + rt6_check_expired(rt->from); } return false; } @@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, int strict) { struct rt6_info *sibling, *next_sibling; - int route_choosen; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -463,26 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(fl6, NULL); - route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1); - /* Don't change the route, if route_choosen == 0 - * (siblings does not include ourself) - */ - if (route_choosen) - list_for_each_entry_safe(sibling, next_sibling, - &match->rt6i_siblings, rt6i_siblings) { - route_choosen--; - if (route_choosen == 0) { - struct inet6_dev *idev = sibling->rt6i_idev; - - if (!netif_carrier_ok(sibling->dst.dev) && - idev->cnf.ignore_routes_with_linkdown) - break; - if (rt6_score_route(sibling, oif, strict) < 0) - break; - match = sibling; - break; - } - } + if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + return match; + + list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, + rt6i_siblings) { + if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + continue; + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; + break; + } + return match; } @@ -499,12 +491,15 @@ static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *local = NULL; struct rt6_info *sprt; - if (!oif && ipv6_addr_any(saddr)) - goto out; + if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) + return rt; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { struct net_device *dev = sprt->dst.dev; + if (sprt->rt6i_nh_flags & RTNH_F_DEAD) + continue; + if (oif) { if (dev->ifindex == oif) return sprt; @@ -533,8 +528,8 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (flags & RT6_LOOKUP_F_IFACE) return net->ipv6.ip6_null_entry; } -out: - return rt; + + return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -679,10 +674,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, int m; bool match_do_rr = false; struct inet6_dev *idev = rt->rt6i_idev; - struct net_device *dev = rt->dst.dev; - if (dev && !netif_carrier_ok(dev) && - idev->cnf.ignore_routes_with_linkdown && + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + goto out; + + if (idev->cnf.ignore_routes_with_linkdown && + rt->rt6i_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -721,7 +718,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -731,7 +728,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, } for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->dst.rt6_next)) { + rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -743,7 +740,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) + for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -781,7 +778,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); + struct rt6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -1054,7 +1051,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); @@ -1274,7 +1271,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, /* ort can't be a cache or pcpu route */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); spin_lock_bh(&rt6_exception_lock); @@ -1346,7 +1343,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { + spin_lock_bh(&ort->rt6i_table->tb6_lock); fib6_update_sernum(ort); + spin_unlock_bh(&ort->rt6i_table->tb6_lock); fib6_force_start_gc(net); } @@ -1415,8 +1414,8 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, /* Remove the passed in cached rt from the hash table that contains it */ int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err; @@ -1460,8 +1459,8 @@ int rt6_remove_exception_rt(struct rt6_info *rt) */ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1586,12 +1585,19 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when * expired, independently from their aging, as per RFC 8201 section 4 */ - if (!(rt->rt6i_flags & RTF_EXPIRES) && - time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { - RT6_TRACE("aging clone %p\n", rt); + if (!(rt->rt6i_flags & RTF_EXPIRES)) { + if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + RT6_TRACE("aging clone %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } + } else if (time_after(jiffies, rt->dst.expires)) { + RT6_TRACE("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; - } else if (rt->rt6i_flags & RTF_GATEWAY) { + } + + if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh; __u8 neigh_flags = 0; @@ -1606,11 +1612,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, rt6_remove_exception(bucket, rt6_ex); return; } - } else if (__rt6_check_expired(rt)) { - RT6_TRACE("purging expired route %p\n", rt); - rt6_remove_exception(bucket, rt6_ex); - return; } + gc_args->more++; } @@ -1824,10 +1827,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) if (skb) { ip6_multipath_l3_keys(skb, &hash_keys); - return flow_hash_from_keys(&hash_keys); + return flow_hash_from_keys(&hash_keys) >> 1; } - return get_hash_from_flowi6(fl6); + return get_hash_from_flowi6(fl6) >> 1; } void ip6_route_input(struct sk_buff *skb) @@ -1929,9 +1932,9 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori static void rt6_dst_from_metrics_check(struct rt6_info *rt) { - if (rt->dst.from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); + if (rt->from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); } static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) @@ -1951,7 +1954,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + rt6_check(rt->from, cookie)) return &rt->dst; else return NULL; @@ -1971,7 +1974,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); @@ -2154,6 +2157,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + continue; if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -2336,6 +2341,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, } rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; @@ -2343,7 +2349,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); - /* Add this dst into uncached_list so that rt6_ifdown() can + /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); @@ -2438,7 +2444,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, static struct rt6_info *ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, - const struct in6_addr *gw_addr) + const struct in6_addr *gw_addr, + u32 tbid, int flags) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2447,15 +2454,15 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, }; struct fib6_table *table; struct rt6_info *rt; - int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; - table = fib6_get_table(net, cfg->fc_table); + table = fib6_get_table(net, tbid); if (!table) return NULL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; + flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); /* if table lookup failed, fall back to full lookup */ @@ -2467,6 +2474,82 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, return rt; } +static int ip6_route_check_nh_onlink(struct net *net, + struct fib6_config *cfg, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; + const struct in6_addr *gw_addr = &cfg->fc_gateway; + u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; + struct rt6_info *grt; + int err; + + err = 0; + grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); + if (grt) { + if (grt->rt6i_flags & flags || dev != grt->dst.dev) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + err = -EINVAL; + } + + ip6_rt_put(grt); + } + + return err; +} + +static int ip6_route_check_nh(struct net *net, + struct fib6_config *cfg, + struct net_device **_dev, + struct inet6_dev **idev) +{ + const struct in6_addr *gw_addr = &cfg->fc_gateway; + struct net_device *dev = _dev ? *_dev : NULL; + struct rt6_info *grt = NULL; + int err = -EHOSTUNREACH; + + if (cfg->fc_table) { + int flags = RT6_LOOKUP_F_IFACE; + + grt = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + + if (!grt) + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + + if (!grt) + goto out; + + if (dev) { + if (dev != grt->dst.dev) { + ip6_rt_put(grt); + goto out; + } + } else { + *_dev = dev = grt->dst.dev; + *idev = grt->rt6i_idev; + dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); + } + + if (!(grt->rt6i_flags & RTF_GATEWAY)) + err = 0; + + ip6_rt_put(grt); + +out: + return err; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -2518,6 +2601,21 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_metric == 0) cfg->fc_metric = IP6_RT_PRIO_USER; + if (cfg->fc_flags & RTNH_F_ONLINK) { + if (!dev) { + NL_SET_ERR_MSG(extack, + "Nexthop device required for onlink"); + err = -ENODEV; + goto out; + } + + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + } + err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { @@ -2592,6 +2690,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; + rt->rt6i_nh_weight = 1; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes @@ -2661,8 +2760,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - struct rt6_info *grt = NULL; - /* IPv6 strictly inhibits using not link-local addresses as nexthop address. Otherwise, router will not able to send redirects. @@ -2679,40 +2776,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } - if (cfg->fc_table) { - grt = ip6_nh_lookup_table(net, cfg, gw_addr); - - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } - } - - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, - cfg->fc_ifindex, 1); - - err = -EHOSTUNREACH; - if (!grt) - goto out; - if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } + if (cfg->fc_flags & RTNH_F_ONLINK) { + err = ip6_route_check_nh_onlink(net, cfg, dev, + extack); } else { - dev = grt->dst.dev; - idev = grt->rt6i_idev; - dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); + err = ip6_route_check_nh(net, cfg, &dev, &idev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - ip6_rt_put(grt); - if (err) goto out; } @@ -2731,6 +2800,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (!dev) goto out; + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + if (!ipv6_addr_any(&cfg->fc_prefsrc)) { if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); @@ -2745,6 +2820,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_flags = cfg->fc_flags; install_route: + if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && + !netif_carrier_ok(dev)) + rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3055,11 +3134,11 @@ out: static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - BUG_ON(from->dst.from); + BUG_ON(from->from); rt->rt6i_flags &= ~RTF_EXPIRES; dst_hold(&from->dst); - rt->dst.from = &from->dst; + rt->from = from; dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); } @@ -3458,37 +3537,249 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) fib6_clean_all(net, fib6_clean_tohost, gateway); } -struct arg_dev_net { - struct net_device *dev; - struct net *net; +struct arg_netdev_event { + const struct net_device *dev; + union { + unsigned int nh_flags; + unsigned long event; + }; }; +static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +{ + struct rt6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + iter = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + while (iter) { + if (iter->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference_protected(iter->rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + } + + return NULL; +} + +static bool rt6_is_dead(const struct rt6_info *rt) +{ + if (rt->rt6i_nh_flags & RTNH_F_DEAD || + (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + return true; + + return false; +} + +static int rt6_multipath_total_weight(const struct rt6_info *rt) +{ + struct rt6_info *iter; + int total = 0; + + if (!rt6_is_dead(rt)) + total += rt->rt6i_nh_weight; + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + if (!rt6_is_dead(iter)) + total += iter->rt6i_nh_weight; + } + + return total; +} + +static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +{ + int upper_bound = -1; + + if (!rt6_is_dead(rt)) { + *weight += rt->rt6i_nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, + total) - 1; + } + atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); +} + +static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +{ + struct rt6_info *iter; + int weight = 0; + + rt6_upper_bound_set(rt, &weight, total); + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + rt6_upper_bound_set(iter, &weight, total); +} + +void rt6_multipath_rebalance(struct rt6_info *rt) +{ + struct rt6_info *first; + int total; + + /* In case the entire multipath route was marked for flushing, + * then there is no need to rebalance upon the removal of every + * sibling route. + */ + if (!rt->rt6i_nsiblings || rt->should_flush) + return; + + /* During lookup routes are evaluated in order, so we need to + * make sure upper bounds are assigned from the first sibling + * onwards. + */ + first = rt6_multipath_first_sibling(rt); + if (WARN_ON_ONCE(!first)) + return; + + total = rt6_multipath_total_weight(first); + rt6_multipath_upper_bound_set(first, total); +} + +static int fib6_ifup(struct rt6_info *rt, void *p_arg) +{ + const struct arg_netdev_event *arg = p_arg; + const struct net *net = dev_net(arg->dev); + + if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { + rt->rt6i_nh_flags &= ~arg->nh_flags; + fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + rt6_multipath_rebalance(rt); + } + + return 0; +} + +void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) +{ + struct arg_netdev_event arg = { + .dev = dev, + { + .nh_flags = nh_flags, + }, + }; + + if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) + arg.nh_flags |= RTNH_F_LINKDOWN; + + fib6_clean_all(dev_net(dev), fib6_ifup, &arg); +} + +static bool rt6_multipath_uses_dev(const struct rt6_info *rt, + const struct net_device *dev) +{ + struct rt6_info *iter; + + if (rt->dst.dev == dev) + return true; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == dev) + return true; + + return false; +} + +static void rt6_multipath_flush(struct rt6_info *rt) +{ + struct rt6_info *iter; + + rt->should_flush = 1; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + iter->should_flush = 1; +} + +static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, + const struct net_device *down_dev) +{ + struct rt6_info *iter; + unsigned int dead = 0; + + if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) + dead++; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == down_dev || + iter->rt6i_nh_flags & RTNH_F_DEAD) + dead++; + + return dead; +} + +static void rt6_multipath_nh_flags_set(struct rt6_info *rt, + const struct net_device *dev, + unsigned int nh_flags) +{ + struct rt6_info *iter; + + if (rt->dst.dev == dev) + rt->rt6i_nh_flags |= nh_flags; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == dev) + iter->rt6i_nh_flags |= nh_flags; +} + /* called with write lock held for table with rt */ -static int fib6_ifdown(struct rt6_info *rt, void *arg) +static int fib6_ifdown(struct rt6_info *rt, void *p_arg) { - const struct arg_dev_net *adn = arg; - const struct net_device *dev = adn->dev; + const struct arg_netdev_event *arg = p_arg; + const struct net_device *dev = arg->dev; + const struct net *net = dev_net(dev); - if ((rt->dst.dev == dev || !dev) && - rt != adn->net->ipv6.ip6_null_entry && - (rt->rt6i_nsiblings == 0 || - (dev && netdev_unregistering(dev)) || - !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) - return -1; + if (rt == net->ipv6.ip6_null_entry) + return 0; + + switch (arg->event) { + case NETDEV_UNREGISTER: + return rt->dst.dev == dev ? -1 : 0; + case NETDEV_DOWN: + if (rt->should_flush) + return -1; + if (!rt->rt6i_nsiblings) + return rt->dst.dev == dev ? -1 : 0; + if (rt6_multipath_uses_dev(rt, dev)) { + unsigned int count; + + count = rt6_multipath_dead_count(rt, dev); + if (rt->rt6i_nsiblings + 1 == count) { + rt6_multipath_flush(rt); + return -1; + } + rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | + RTNH_F_LINKDOWN); + fib6_update_sernum(rt); + rt6_multipath_rebalance(rt); + } + return -2; + case NETDEV_CHANGE: + if (rt->dst.dev != dev || + rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + break; + rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt6_multipath_rebalance(rt); + break; + } return 0; } -void rt6_ifdown(struct net *net, struct net_device *dev) +void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { - struct arg_dev_net adn = { + struct arg_netdev_event arg = { .dev = dev, - .net = net, + { + .event = event, + }, }; - fib6_clean_all(net, fib6_ifdown, &adn); - if (dev) - rt6_uncached_list_flush_dev(net, dev); + fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); +} + +void rt6_disable_ip(struct net_device *dev, unsigned long event) +{ + rt6_sync_down_dev(dev, event); + rt6_uncached_list_flush_dev(dev_net(dev), dev); + neigh_ifdown(&nd_tbl, dev); } struct rt6_mtu_change_arg { @@ -3602,6 +3893,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_CLONED) cfg->fc_flags |= RTF_CACHE; + cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); @@ -3811,6 +4104,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); @@ -3991,7 +4286,10 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, unsigned int *flags, bool skip_oif) { - if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + *flags |= RTNH_F_DEAD; + + if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; @@ -4002,6 +4300,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } + *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; @@ -4030,7 +4329,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = 0; + rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) @@ -4297,19 +4596,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - if (!fibmatch) - dst = ip6_route_input_lookup(net, dev, &fl6, flags); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_input_lookup(net, dev, &fl6, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; - if (!fibmatch) - dst = ip6_route_output(net, NULL, &fl6); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_output(net, NULL, &fl6); } @@ -4326,6 +4619,14 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + if (fibmatch && rt->from) { + struct rt6_info *ort = rt->from; + + dst_hold(&ort->dst); + ip6_rt_put(rt); + rt = ort; + } + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); @@ -4423,7 +4724,6 @@ static int ip6_route_dev_notify(struct notifier_block *this, #ifdef CONFIG_PROC_FS static const struct file_operations ipv6_route_proc_fops = { - .owner = THIS_MODULE, .open = ipv6_route_open, .read = seq_read, .llseek = seq_lseek, @@ -4451,7 +4751,6 @@ static int rt6_stats_seq_open(struct inode *inode, struct file *file) } static const struct file_operations rt6_stats_seq_fops = { - .owner = THIS_MODULE, .open = rt6_stats_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -4596,8 +4895,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_ip6_dst_entries; - net->ipv6.ip6_null_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@ -4609,8 +4906,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; - net->ipv6.ip6_prohibit_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_prohibit_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); @@ -4620,8 +4915,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; - net->ipv6.ip6_blk_hole_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); @@ -4778,11 +5071,20 @@ int __init ip6_route_init(void) if (ret) goto fib6_rules_init; - ret = -ENOBUFS; - if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) || - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) || - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED)) + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, + inet6_rtm_newroute, NULL, 0); + if (ret < 0) + goto out_register_late_subsys; + + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, + inet6_rtm_delroute, NULL, 0); + if (ret < 0) + goto out_register_late_subsys; + + ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, + inet6_rtm_getroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED); + if (ret < 0) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); @@ -4800,6 +5102,7 @@ out: return ret; out_register_late_subsys: + rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index c81407770956..7f5621d09571 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -306,9 +306,7 @@ static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) struct seg6_hmac_info *hinfo; int ret; - ret = rhashtable_walk_start(iter); - if (ret && ret != -EAGAIN) - goto done; + rhashtable_walk_start(iter); for (;;) { hinfo = rhashtable_walk_next(iter); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 825b8e01f947..ba3767ef5e93 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -501,7 +501,7 @@ static struct seg6_action_desc *__get_action_desc(int action) struct seg6_action_desc *desc; int i, count; - count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc); + count = ARRAY_SIZE(seg6_action_table); for (i = 0; i < count; i++) { desc = &seg6_action_table[i]; if (desc->action == action) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d7dc23c1b2ca..3873d3877135 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -934,8 +934,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, df = 0; } - if (tunnel->parms.iph.daddr && skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (tunnel->parms.iph.daddr) + skb_dst_update_pmtu(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7178476b3d2f..a1ab29e2ab3b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -176,8 +176,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, /* If interface is set while binding, indices * must coincide. */ - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) + if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; @@ -1795,7 +1794,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_expires = jiffies; } - state = sk_state_load(sp); + state = inet_sk_state_load(sp); if (state == TCP_LISTEN) rx_queue = sp->sk_ack_backlog; else @@ -1884,7 +1883,6 @@ out: } static const struct file_operations tcp6_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = tcp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index d883c9204c01..278e49cd67d4 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -46,6 +46,9 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, { struct tcphdr *th; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)) + return ERR_PTR(-EINVAL); + if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3f30fa313bf2..52e3ea0e6f50 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -89,28 +89,12 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -static u32 udp6_portaddr_hash(const struct net *net, - const struct in6_addr *addr6, - unsigned int port) -{ - unsigned int hash, mix = net_hash_mix(net); - - if (ipv6_addr_any(addr6)) - hash = jhash_1word(0, mix); - else if (ipv6_addr_v4mapped(addr6)) - hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); - else - hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); - - return hash ^ port; -} - int udp_v6_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum); + ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum); unsigned int hash2_partial = - udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); + ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -119,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) static void udp_v6_rehash(struct sock *sk) { - u16 new_hash = udp6_portaddr_hash(sock_net(sk), + u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); @@ -184,7 +168,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -193,8 +177,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); @@ -202,15 +185,9 @@ static struct sock *udp6_lib_lookup2(struct net *net, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -228,11 +205,11 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp6_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { - hash2 = udp6_portaddr_hash(net, daddr, hnum); + hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -243,7 +220,7 @@ struct sock *__udp6_lib_lookup(struct net *net, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. */ if (unlikely(slot2 == old_slot2)) @@ -267,23 +244,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -719,9 +689,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & + hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) & udptable->mask; - hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -909,7 +879,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); @@ -1509,7 +1479,6 @@ int udp6_seq_show(struct seq_file *seq, void *v) } static const struct file_operations udp6_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a0f89ad76f9d..2a04dc9c781b 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -42,6 +42,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, const struct ipv6hdr *ipv6h; struct udphdr *uh; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + goto out; + if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 2784cc363f2b..14ae32bb1f3d 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -94,7 +94,6 @@ void udplitev6_exit(void) #ifdef CONFIG_PROC_FS static const struct file_operations udplite6_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index fe04e23af986..841f4a07438e 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -32,6 +32,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, } EXPORT_SYMBOL(xfrm6_rcv_spi); +static int xfrm6_transport_finish2(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + if (xfrm_trans_queue(skb, ip6_rcv_finish)) + __kfree_skb(skb); + return -1; +} + int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); @@ -56,7 +64,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_rcv_finish); + xfrm6_transport_finish2); return -1; } diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 02556e356f87..bb935a3b7fea 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -59,7 +59,7 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->hop_limit = ip6_dst_hoplimit(dst->child); + top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = *(struct in6_addr *)&x->props.saddr; top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; @@ -92,6 +92,7 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); + eth_hdr(skb)->h_proto = skb->protocol; err = 0; @@ -105,17 +106,14 @@ static struct sk_buff *xfrm6_mode_tunnel_gso_segment(struct xfrm_state *x, { __skb_push(skb, skb->mac_len); return skb_mac_gso_segment(skb, features); - } static void xfrm6_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); - if (xo->flags & XFRM_GSO_SEGMENT) { - skb->network_header = skb->network_header - x->props.header_len; + if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); - } skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 885ade234a49..09fb44ee3b45 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -265,7 +265,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); - xdst = (struct xfrm_dst *)xdst->u.dst.child; + xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); |