diff options
Diffstat (limited to 'net/ipv6')
-rw-r--r-- | net/ipv6/addrconf.c | 10 | ||||
-rw-r--r-- | net/ipv6/addrconf_core.c | 6 | ||||
-rw-r--r-- | net/ipv6/af_inet6.c | 3 | ||||
-rw-r--r-- | net/ipv6/ip6_fib.c | 103 | ||||
-rw-r--r-- | net/ipv6/ip6_output.c | 315 | ||||
-rw-r--r-- | net/ipv6/ndisc.c | 8 | ||||
-rw-r--r-- | net/ipv6/netfilter.c | 123 | ||||
-rw-r--r-- | net/ipv6/netfilter/nf_conntrack_reasm.c | 43 | ||||
-rw-r--r-- | net/ipv6/proc.c | 4 | ||||
-rw-r--r-- | net/ipv6/reassembly.c | 42 | ||||
-rw-r--r-- | net/ipv6/route.c | 485 |
11 files changed, 737 insertions, 405 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b51630ddb728..6b673d4f5ca9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2421,9 +2421,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) + if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex) continue; - if (no_gw && rt->fib6_nh.fib_nh_gw_family) + if (no_gw && rt->fib6_nh->fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; @@ -6354,16 +6354,16 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct fib6_info *rt = ifa->rt; + struct fib6_nh *nh = ifa->rt->fib6_nh; int cpu; rcu_read_lock(); ifa->rt->dst_nopolicy = val ? true : false; - if (rt->rt6i_pcpu) { + if (nh->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; - rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu); addrconf_set_nopolicy(*rtp, val); } } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 5b1246635e02..783f3c1466da 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +{ + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, + .ip6_del_rt = eafnosupport_ip6_del_rt, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c04ae282f4e4..cc6f8d0c625a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -926,6 +926,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, + .fib6_update_sernum = fib6_update_sernum_stub, + .fib6_rt_update = fib6_rt_update, + .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 008421b550c6..cdfb8500ccae 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -147,20 +147,18 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; + size_t sz = sizeof(*f6i); - f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (with_fib6_nh) + sz += sizeof(struct fib6_nh); + + f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; - f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); - if (!f6i->rt6i_pcpu) { - kfree(f6i); - return NULL; - } - INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); @@ -170,36 +168,11 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); - struct rt6_exception_bucket *bucket; WARN_ON(f6i->fib6_node); - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - kfree(bucket); - - if (f6i->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } - - free_percpu(f6i->rt6i_pcpu); - } - - fib6_nh_release(&f6i->fib6_nh); - + fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); - kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); @@ -393,10 +366,10 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, return call_fib6_notifier(nb, net, event_type, &info.info); } -static int call_fib6_entry_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib6_info *rt, - struct netlink_ext_ack *extack) +int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, @@ -899,16 +872,14 @@ insert_above: return ln; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, + const struct fib6_info *match, + const struct fib6_table *table) { int cpu; - /* Make sure rt6_make_pcpu_route() wont add other percpu routes - * while we are cleaning them here. - */ - f6i->fib6_destroying = 1; - mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + if (!fib6_nh->rt6i_pcpu) + return; /* release the reference to this fib entry from * all of its cached pcpu routes @@ -917,9 +888,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { + + /* only dropping the 'from' reference if the cached route + * is using 'match'. The cached pcpu_rt->from only changes + * from a fib6_info to NULL (ip6_dst_destroy); it can never + * change from one fib6_info reference to another + */ + if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); @@ -928,13 +905,27 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, } } +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) +{ + struct fib6_nh *fib6_nh; + + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + + fib6_nh = f6i->fib6_nh; + __fib6_drop_pcpu_from(fib6_nh, f6i, table); +} + static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt, table); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1222,6 +1213,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } +/* allow ipv4 to update sernum via ipv6_stub */ +void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum_upto_root(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> @@ -2306,14 +2305,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_gw_family) { + if (rt->fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; - seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); + seq_printf(seq, "%pi6", &rt->fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } - dev = rt->fib6_nh.fib_nh_dev; + dev = rt->fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f9e43323e667..0bb6b6de7962 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -592,6 +592,170 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, + struct ip6_fraglist_iter *iter) +{ + unsigned int first_len; + struct frag_hdr *fh; + + /* BUILD HEADER */ + *prevhdr = NEXTHDR_FRAGMENT; + iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!iter->tmp_hdr) + return -ENOMEM; + + iter->frag_list = skb_shinfo(skb)->frag_list; + iter->frag = iter->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->hlen = hlen; + iter->frag_id = frag_id; + iter->nexthdr = nexthdr; + + __skb_pull(skb, hlen); + fh = __skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); + + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + fh->identification = frag_id; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + return 0; +} +EXPORT_SYMBOL(ip6_fraglist_init); + +void ip6_fraglist_prepare(struct sk_buff *skb, + struct ip6_fraglist_iter *iter) +{ + struct sk_buff *frag = iter->frag; + unsigned int hlen = iter->hlen; + struct frag_hdr *fh; + + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = __skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); + iter->offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = iter->nexthdr; + fh->reserved = 0; + fh->frag_off = htons(iter->offset); + if (frag->next) + fh->frag_off |= htons(IP6_MF); + fh->identification = iter->frag_id; + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); +} +EXPORT_SYMBOL(ip6_fraglist_prepare); + +void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, + unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) +{ + state->prevhdr = prevhdr; + state->nexthdr = nexthdr; + state->frag_id = frag_id; + + state->hlen = hlen; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->hroom = hdr_room; + state->troom = needed_tailroom; + + state->offset = 0; +} +EXPORT_SYMBOL(ip6_frag_init); + +struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) +{ + u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; + struct sk_buff *frag; + struct frag_hdr *fh; + unsigned int len; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) + len &= ~7; + + /* Allocate buffer */ + frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + + state->hroom + state->troom, GFP_ATOMIC); + if (!frag) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, state->hroom); + skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); + frag->transport_header = (frag->network_header + state->hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); + + fragnexthdr_offset = skb_network_header(frag); + fragnexthdr_offset += prevhdr - skb_network_header(skb); + *fragnexthdr_offset = NEXTHDR_FRAGMENT; + + /* + * Build fragment header. + */ + fh->nexthdr = state->nexthdr; + fh->reserved = 0; + fh->identification = state->frag_id; + + /* + * Copy a block of the IP datagram. + */ + BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), + len)); + state->left -= len; + + fh->frag_off = htons(state->offset); + if (state->left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + state->ptr += len; + state->offset += len; + + return frag; +} +EXPORT_SYMBOL(ip6_frag_next); + int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { @@ -599,12 +763,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - struct ipv6hdr *tmp_hdr; - struct frag_hdr *fh; - unsigned int mtu, hlen, left, len, nexthdr_offset; - int hroom, troom; + struct ip6_frag_state state; + unsigned int mtu, hlen, nexthdr_offset; + int hroom, err = 0; __be32 frag_id; - int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; err = ip6_find_1stfragopt(skb, &prevhdr); @@ -651,6 +813,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu || @@ -678,74 +841,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb->truesize -= frag->truesize; } - err = 0; - offset = 0; - /* BUILD HEADER */ - - *prevhdr = NEXTHDR_FRAGMENT; - tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); - if (!tmp_hdr) { - err = -ENOMEM; + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) goto fail; - } - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - - __skb_pull(skb, hlen); - fh = __skb_push(skb, sizeof(struct frag_hdr)); - __skb_push(skb, hlen); - skb_reset_network_header(skb); - memcpy(skb_network_header(skb), tmp_hdr, hlen); - - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(IP6_MF); - fh->identification = frag_id; - - first_len = skb_pagelen(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - ipv6_hdr(skb)->payload_len = htons(first_len - - sizeof(struct ipv6hdr)); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - fh = __skb_push(frag, sizeof(struct frag_hdr)); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), tmp_hdr, - hlen); - offset += skb->len - hlen - sizeof(struct frag_hdr); - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(offset); - if (frag->next) - fh->frag_off |= htons(IP6_MF); - fh->identification = frag_id; - ipv6_hdr(frag)->payload_len = - htons(frag->len - - sizeof(struct ipv6hdr)); - ip6_copy_metadata(frag, skb); - } + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip6_fraglist_next(&iter); } - kfree(tmp_hdr); + kfree(iter.tmp_hdr); if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -753,7 +871,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag_list); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); @@ -770,91 +888,26 @@ slow_path_clean: } slow_path: - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - troom = rt->dst.dev->needed_tailroom; + ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, + LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, + &state); /* * Keep copying data until we run out. */ - while (left > 0) { - u8 *fragnexthdr_offset; - - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - /* Allocate buffer */ - frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC); - if (!frag) { - err = -ENOMEM; + while (state.left > 0) { + frag = ip6_frag_next(skb, &state); + if (IS_ERR(frag)) { + err = PTR_ERR(frag); goto fail; } /* - * Set up data on packet - */ - - ip6_copy_metadata(frag, skb); - skb_reserve(frag, hroom); - skb_put(frag, len + hlen + sizeof(struct frag_hdr)); - skb_reset_network_header(frag); - fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); - frag->transport_header = (frag->network_header + hlen + - sizeof(struct frag_hdr)); - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - if (skb->sk) - skb_set_owner_w(frag, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); - - fragnexthdr_offset = skb_network_header(frag); - fragnexthdr_offset += prevhdr - skb_network_header(skb); - *fragnexthdr_offset = NEXTHDR_FRAGMENT; - - /* - * Build fragment header. - */ - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->identification = frag_id; - - /* - * Copy a block of the IP datagram. - */ - BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), - len)); - left -= len; - - fh->frag_off = htons(offset); - if (left > 0) - fh->frag_off |= htons(IP6_MF); - ipv6_hdr(frag)->payload_len = htons(frag->len - - sizeof(struct ipv6hdr)); - - ptr += len; - offset += len; - - /* * Put this fragment into the sending queue. */ err = output(net, sk, frag); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 4c8e2ea8bf19..f874dde1ee85 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1293,8 +1293,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, @@ -1323,8 +1323,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 1240ccd57f39..9530cc280953 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -16,6 +16,9 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/netfilter/nf_queue.h> +#include <net/netfilter/nf_conntrack_bridge.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include "../bridge/br_private.h" int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { @@ -109,6 +112,122 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, } EXPORT_SYMBOL_GPL(__nf_ip6_route); +int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + struct ip6_frag_state state; + u8 *prevhdr, nexthdr = 0; + unsigned int mtu, hlen; + int hroom, err = 0; + __be32 frag_id; + + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + goto blackhole; + hlen = err; + nexthdr = *prevhdr; + + mtu = skb->dev->mtu; + if (frag_max_size > mtu || + frag_max_size < IPV6_MIN_MTU) + goto blackhole; + + mtu = frag_max_size; + if (mtu < hlen + sizeof(struct frag_hdr) + 8) + goto blackhole; + mtu -= hlen + sizeof(struct frag_hdr); + + frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + hroom = LL_RESERVED_SPACE(skb->dev); + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag2) { + if (frag2->len > mtu || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + goto blackhole; + + /* Partially cloned skb? */ + if (skb_shared(frag2)) + goto slow_path; + } + + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) + goto blackhole; + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. + */ + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip6_fraglist_next(&iter); + } + + kfree(iter.tmp_hdr); + if (!err) + return 0; + + kfree_skb_list(iter.frag_list); + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom, + LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id, + &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip6_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(br_ip6_fragment); + static const struct nf_ipv6_ops ipv6ops = { #if IS_MODULE(CONFIG_IPV6) .chk_addr = ipv6_chk_addr, @@ -119,6 +238,10 @@ static const struct nf_ipv6_ops ipv6ops = { .route_input = ip6_route_input, .fragment = ip6_fragment, .reroute = nf_ip6_reroute, +#if IS_MODULE(CONFIG_IPV6) + .br_defrag = nf_ct_frag6_gather, + .br_fragment = br_ip6_fragment, +#endif }; int __init ipv6_netfilter_init(void) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3de0e9b0a482..c5d59fa568d6 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -58,26 +58,21 @@ static struct inet_frags nf_frags; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", - .data = &init_net.nf_frag.frags.timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", - .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", - .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -93,15 +88,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net) GFP_KERNEL); if (table == NULL) goto err_alloc; - - table[0].data = &net->nf_frag.frags.timeout; - table[1].data = &net->nf_frag.frags.low_thresh; - table[1].extra2 = &net->nf_frag.frags.high_thresh; - table[2].data = &net->nf_frag.frags.high_thresh; - table[2].extra1 = &net->nf_frag.frags.low_thresh; - table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } + table[0].data = &net->nf_frag.fqdir->timeout; + table[1].data = &net->nf_frag.fqdir->low_thresh; + table[1].extra2 = &net->nf_frag.fqdir->high_thresh; + table[2].data = &net->nf_frag.fqdir->high_thresh; + table[2].extra1 = &net->nf_frag.fqdir->low_thresh; + table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh; + hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) goto err_reg; @@ -148,12 +143,10 @@ static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ @@ -169,7 +162,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->nf_frag.frags, &key); + q = inet_frag_find(net->nf_frag.fqdir, &key); if (!q) return NULL; @@ -276,7 +269,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -496,24 +489,24 @@ static int nf_ct_net_init(struct net *net) { int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - net->nf_frag.frags.f = &nf_frags; - - res = inet_frags_init_net(&net->nf_frag.frags); + res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net); if (res < 0) return res; + + net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = nf_ct_frag6_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); return res; } static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); } static struct pernet_operations nf_ct_net_ops = { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 2356b4af7309..0bbefc440bcd 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -48,8 +48,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", - atomic_read(&net->ipv6.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv6.frags)); + atomic_read(&net->ipv6.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv6.fqdir)); return 0; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 1a832f5e190b..ff5b6d8de2c6 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -76,12 +76,10 @@ static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, ipv6.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * @@ -100,7 +98,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) IPV6_ADDR_LINKLOCAL))) key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &key); + q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; @@ -200,7 +198,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = -skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) @@ -254,7 +252,7 @@ err: static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(fq->q.net, struct net, ipv6.frags); + struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; @@ -401,23 +399,18 @@ static const struct inet6_protocol frag_protocol = { static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", - .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", - .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", - .data = &init_net.ipv6.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -449,12 +442,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv6.frags.high_thresh; - table[0].extra1 = &net->ipv6.frags.low_thresh; - table[1].data = &net->ipv6.frags.low_thresh; - table[1].extra2 = &net->ipv6.frags.high_thresh; - table[2].data = &net->ipv6.frags.timeout; } + table[0].data = &net->ipv6.fqdir->high_thresh; + table[0].extra1 = &net->ipv6.fqdir->low_thresh; + table[1].data = &net->ipv6.fqdir->low_thresh; + table[1].extra2 = &net->ipv6.fqdir->high_thresh; + table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl(net, "net/ipv6", table); if (!hdr) @@ -517,25 +510,24 @@ static int __net_init ipv6_frags_init_net(struct net *net) { int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - net->ipv6.frags.f = &ip6_frags; - - res = inet_frags_init_net(&net->ipv6.frags); + res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; + net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = ip6_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); return res; } static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { @@ -591,8 +583,8 @@ err_protocol: void ipv6_frag_exit(void) { - inet_frags_fini(&ip6_frags); ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + inet_frags_fini(&ip6_frags); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 848e944f07df..fada5a13bcb2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -441,12 +441,12 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { - const struct fib6_nh *nh = &sibling->fib6_nh; + const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); @@ -460,7 +460,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, out: res->f6i = match; - res->nh = &match->fib6_nh; + res->nh = match->fib6_nh; } /* @@ -496,13 +496,13 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { - nh = &f6i->fib6_nh; + nh = f6i->fib6_nh; if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { - nh = &spf6i->fib6_nh; + nh = spf6i->fib6_nh; if (__rt6_device_match(net, nh, saddr, oif, flags)) { res->f6i = spf6i; goto out; @@ -511,14 +511,14 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; goto out; } - nh = &f6i->fib6_nh; + nh = f6i->fib6_nh; if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; } out: res->nh = nh; @@ -714,7 +714,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, if (fib6_check_expired(f6i)) continue; - nh = &f6i->fib6_nh; + nh = f6i->fib6_nh; if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { res->f6i = f6i; res->nh = nh; @@ -796,7 +796,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif, out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; - res->nh = &res->f6i->fib6_nh; + res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } @@ -1270,7 +1270,7 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt, **p; - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); pcpu_rt = *p; if (pcpu_rt) @@ -1291,7 +1291,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, } dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1461,25 +1461,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res) return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } +#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL + +/* used when the flushed bit is not relevant, only access to the bucket + * (ie., all bucket users except rt6_insert_exception); + * + * called under rcu lock; sometimes called with rt6_exception_lock held + */ +static +struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + + if (lock) + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + else + bucket = rcu_dereference(nh->rt6i_exception_bucket); + + /* remove bucket flushed bit if set */ + if (bucket) { + unsigned long p = (unsigned long)bucket; + + p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + } + + return bucket; +} + +static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) +{ + unsigned long p = (unsigned long)bucket; + + return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); +} + +/* called with rt6_exception_lock held */ +static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + unsigned long p; + + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + + p = (unsigned long)bucket; + p |= FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); +} + static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; + struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *f6i = res->f6i; + struct fib6_nh *nh = res->nh; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (f6i->exception_bucket_flushed) { - err = -EINVAL; - goto out; - } - - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); @@ -1487,7 +1536,10 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); + } else if (fib6_nh_excptn_bucket_flushed(bucket)) { + err = -EINVAL; + goto out; } #ifdef CONFIG_IPV6_SUBTREES @@ -1542,7 +1594,7 @@ out: return err; } -void rt6_flush_exceptions(struct fib6_info *rt) +static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1550,25 +1602,33 @@ void rt6_flush_exceptions(struct fib6_info *rt) int i; spin_lock_bh(&rt6_exception_lock); - /* Prevent rt6_insert_exception() to recreate the bucket list */ - rt->exception_bucket_flushed = 1; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; + /* Prevent rt6_insert_exception() to recreate the bucket list */ + if (!from) + fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) - rt6_remove_exception(bucket, rt6_ex); - WARN_ON_ONCE(bucket->depth); + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { + if (!from || + rcu_access_pointer(rt6_ex->rt6i->from) == from) + rt6_remove_exception(bucket, rt6_ex); + } + WARN_ON_ONCE(!from && bucket->depth); bucket++; } - out: spin_unlock_bh(&rt6_exception_lock); } +void rt6_flush_exceptions(struct fib6_info *f6i) +{ + fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); +} + /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ @@ -1597,7 +1657,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, src_key = saddr; find_ex: #endif - bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); + bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) @@ -1615,25 +1675,20 @@ find_ex: } /* Remove the passed in cached rt from the hash table that contains it */ -static int rt6_remove_exception_rt(struct rt6_info *rt) +static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *from; int err; - from = rcu_dereference(rt->from); - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; - - if (!rcu_access_pointer(from->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(from->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); + #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1641,7 +1696,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1658,23 +1713,29 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) return err; } -/* Find rt6_ex which contains the passed in rt cache and - * refresh its stamp - */ -static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +static int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; - struct rt6_exception *rt6_ex; struct fib6_info *from; - rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) - goto unlock; + return -EINVAL; - bucket = rcu_dereference(from->rt6i_exception_bucket); + return fib6_nh_remove_exception(from->fib6_nh, + from->fib6_src.plen, rt); +} +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) +{ + const struct in6_addr *src_key = NULL; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1682,15 +1743,25 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif - rt6_ex = __rt6_find_exception_rcu(&bucket, - &rt->rt6i_dst.addr, - src_key); + rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; +} +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct fib6_info *from; + + rcu_read_lock(); + + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + + fib6_nh_update_exception(from->fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1718,15 +1789,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct fib6_info *rt, int mtu) + const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; @@ -1748,21 +1817,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct fib6_info *rt, - struct in6_addr *gateway) +static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, + const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1827,23 +1894,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct fib6_info *rt, - struct fib6_gc_args *gc_args, - unsigned long now) +static void fib6_nh_age_exceptions(const struct fib6_nh *nh, + struct fib6_gc_args *gc_args, + unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1858,6 +1923,13 @@ void rt6_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } +void rt6_age_exceptions(struct fib6_info *f6i, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); +} + /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) @@ -2384,7 +2456,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_unlock(); return; } - res.nh = &res.f6i->fib6_nh; + res.nh = res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; @@ -2533,7 +2605,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; if (fib6_check_expired(rt)) continue; @@ -2557,7 +2629,7 @@ restart: } res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); @@ -3074,6 +3146,12 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, !netif_carrier_ok(dev)) fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) { + err = -ENOMEM; + goto out; + } + err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) @@ -3098,6 +3176,38 @@ out: void fib6_nh_release(struct fib6_nh *fib6_nh) { + struct rt6_exception_bucket *bucket; + + rcu_read_lock(); + + fib6_nh_flush_exceptions(fib6_nh, NULL); + bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); + if (bucket) { + rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); + kfree(bucket); + } + + rcu_read_unlock(); + + if (fib6_nh->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + + free_percpu(fib6_nh->rt6i_pcpu); + } + fib_nh_common_release(&fib6_nh->nh_common); } @@ -3160,7 +3270,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags); + rt = fib6_info_alloc(gfp_flags, true); if (!rt) goto out; @@ -3200,7 +3310,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) goto out; @@ -3208,7 +3318,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, * they would result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type)) rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; if (!ipv6_addr_any(&cfg->fc_prefsrc)) { @@ -3326,7 +3436,7 @@ out_put: return err; } -static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; @@ -3342,10 +3452,25 @@ out: return rc; } +static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, + struct fib6_nh *nh) +{ + struct fib6_result res = { + .f6i = rt, + .nh = nh, + }; + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); + if (rt_cache) + return __ip6_del_cached_rt(rt_cache, cfg); + + return 0; +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_cache; struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; @@ -3368,26 +3493,18 @@ static int ip6_route_del(struct fib6_config *cfg, for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; + nh = rt->fib6_nh; if (cfg->fc_flags & RTF_CACHE) { - struct fib6_result res = { - .f6i = rt, - }; int rc; - rt_cache = rt6_find_cached_rt(&res, - &cfg->fc_dst, - &cfg->fc_src); - if (rt_cache) { - rc = ip6_del_cached_rt(rt_cache, cfg); - if (rc != -ESRCH) { - rcu_read_unlock(); - return rc; - } + rc = ip6_del_cached_rt(cfg, rt, nh); + if (rc != -ESRCH) { + rcu_read_unlock(); + return rc; } continue; } - nh = &rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) @@ -3509,7 +3626,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (!res.f6i) goto out; - res.nh = &res.f6i->fib6_nh; + res.nh = res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); @@ -3561,12 +3678,12 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) + if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_gw_family) + !rt->fib6_nh->fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3624,7 +3741,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - struct fib6_nh *nh = &rt->fib6_nh; + struct fib6_nh *nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && @@ -3876,7 +3993,7 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && + if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3904,18 +4021,17 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + struct fib6_nh *nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_gw_family && - ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { + nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; - } /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ - rt6_exceptions_clean_tohost(rt, gateway); + fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } @@ -3955,9 +4071,9 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && - ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) + if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; @@ -3969,11 +4085,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.fib_nh_weight; + total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.fib_nh_weight; + total += iter->fib6_nh->fib_nh_weight; } return total; @@ -3984,11 +4100,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.fib_nh_weight; + *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -4032,8 +4148,8 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) struct net *net = dev_net(arg->dev); if (rt != net->ipv6.fib6_null_entry && - rt->fib6_nh.fib_nh_dev == arg->dev) { - rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; + rt->fib6_nh->fib_nh_dev == arg->dev) { + rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -4061,10 +4177,10 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) + if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) + if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; @@ -4085,12 +4201,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.fib_nh_dev == down_dev || - rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh->fib_nh_dev == down_dev || + rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == down_dev || - iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh->fib_nh_dev == down_dev || + iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -4102,11 +4218,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) - rt->fib6_nh.fib_nh_flags |= nh_flags; + if (rt->fib6_nh->fib_nh_dev == dev) + rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) - iter->fib6_nh.fib_nh_flags |= nh_flags; + if (iter->fib6_nh->fib_nh_dev == dev) + iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4121,12 +4237,12 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4142,10 +4258,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.fib_nh_dev != dev || + if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4179,9 +4295,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event) struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; + struct fib6_info *f6i; }; -static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) +static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; + struct fib6_info *f6i = arg->f6i; + + /* For administrative MTU increase, there is no way to discover + * IPv6 PMTU increase, so PMTU increase should be updated here. + * Since RFC 1981 doesn't include administrative MTU increase + * update PMTU increase is a MUST. (i.e. jumbo frame) + */ + if (nh->fib_nh_dev == arg->dev) { + struct inet6_dev *idev = __in6_dev_get(arg->dev); + u32 mtu = f6i->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(f6i, RTAX_MTU, arg->mtu); + + spin_lock_bh(&rt6_exception_lock); + rt6_exceptions_update_pmtu(idev, nh, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); + } + + return 0; +} + +static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4196,24 +4339,11 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) if (!idev) return 0; - /* For administrative MTU increase, there is no way to discover - IPv6 PMTU increase, so PMTU increase should be updated here. - Since RFC 1981 doesn't include administrative MTU increase - update PMTU increase is a MUST. (i.e. jumbo frame) - */ - if (rt->fib6_nh.fib_nh_dev == arg->dev && - !fib6_metric_locked(rt, RTAX_MTU)) { - u32 mtu = rt->fib6_pmtu; - - if (mtu >= arg->mtu || - (mtu < arg->mtu && mtu == idev->cnf.mtu6)) - fib6_metric_set(rt, RTAX_MTU, arg->mtu); + if (fib6_metric_locked(f6i, RTAX_MTU)) + return 0; - spin_lock_bh(&rt6_exception_lock); - rt6_exceptions_update_pmtu(idev, rt, arg->mtu); - spin_unlock_bh(&rt6_exception_lock); - } - return 0; + arg->f6i = f6i; + return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) @@ -4227,6 +4357,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu) } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, @@ -4492,7 +4623,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4659,7 +4790,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); + + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws); nexthop_len *= rt->fib6_nsiblings; } @@ -4677,7 +4808,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws) + nexthop_len; } @@ -4797,14 +4928,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (!mp) goto nla_put_failure; - if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, - rt->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, + rt->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, - sibling->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, + sibling->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; } @@ -4812,7 +4943,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, } else { unsigned char nh_flags = 0; - if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, &nh_flags, false) < 0) goto nla_put_failure; @@ -4842,7 +4973,7 @@ nla_put_failure: static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.fib_nh_dev == dev) + if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4850,7 +4981,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.fib_nh_dev == dev) + if (sibling->fib6_nh->fib_nh_dev == dev) return true; } } @@ -5129,6 +5260,38 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } +void fib6_rt_update(struct net *net, struct fib6_info *rt, + struct nl_info *info) +{ + u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* call_fib6_entry_notifiers will be removed when in-kernel notifier + * is implemented and supported for nexthop objects + */ + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); + + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + if (!skb) + goto errout; + + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} + static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -5139,7 +5302,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5333,11 +5496,11 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; - net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, - sizeof(*net->ipv6.fib6_null_entry), - GFP_KERNEL); + net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; + memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), @@ -5474,7 +5637,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES |