summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2017-06-18 04:54:01 +0200
committerDavid S. Miller <davem@davemloft.net>2017-06-18 04:54:01 +0200
commitffe95ecf3a2e060cc95ad8835a71273a4a3b3923 (patch)
tree84d1cd1b7a6b3990749e2423a6a2a38a2478acd7 /net
parentMerge tag 'mlx5-updates-2017-06-16' of git://git.kernel.org/pub/scm/linux/ker... (diff)
parentnet: add debug atomic_inc_not_zero() in dst_hold() (diff)
downloadlinux-ffe95ecf3a2e060cc95ad8835a71273a4a3b3923.tar.xz
linux-ffe95ecf3a2e060cc95ad8835a71273a4a3b3923.zip
Merge branch 'net-remove-dst-garbage-collector-logic'
Wei Wang says: ==================== remove dst garbage collector logic The current mechanism of dst release is a bit complicated. It is because the users of dst get divided into 2 situations: 1. Most users take the reference count when using a dst and release the reference count when done. 2. Exceptional users like IPv4/IPv6/decnet/xfrm routing code do not take reference count when referencing to a dst due to some histotic reasons. Due to those exceptional use cases in 2, reference count being 0 is not an adequate evidence to indicate that no user is using this dst. So users in 1 can't free the dst simply based on reference count being 0 because users in 2 might still hold reference to it. Instead, a dst garbage list is needed to hold the dst entries that already get removed by the users in 2 but are still held by users in 1. And a periodic garbage collector task is run to check all the dst entries in the list to see if the users in 1 have released the reference to those dst entries. If so, the dst is now ready to be freed. This logic introduces unnecessary complications in the dst code which makes it hard to understand and to debug. In order to get rid of the whole dst garbage collector (gc) and make the dst code more unified and simplified, we can make the users in 2 also take reference count on the dst and release it properly when done. This way, dst can be safely freed once the refcount drops to 0 and no gc thread is needed anymore. This patch series' target is to completely get rid of dst gc logic and free dst based on reference count only. Patch 1-3 are preparation patches to do some cleanup/improvement on the existing code to make later work easier. Patch 4-21 are real implementations. In these patches, a temporary flag DST_NOGC is used to help transition those exceptional users one by one. Once every component is transitioned, this temporary flag is removed. By the end of this patch series, all dst are refcounted when being used and released when done. And dst will be freed when its refcount drops to 0. No dst gc task is running anymore. Note: This patch series depends on the decnet fix that was sent right before: "decnet: always not take dst->__refcnt when inserting dst into hash table" v2: add curly braces in udp_v4/6_early_demux() in patch 02 add EXPORT_SYMBOL() for dst_dev_put() in patch 05 ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/core/dev.c1
-rw-r--r--net/core/dst.c276
-rw-r--r--net/decnet/dn_route.c34
-rw-r--r--net/ipv4/fib_semantics.c9
-rw-r--r--net/ipv4/route.c62
-rw-r--r--net/ipv4/udp.c19
-rw-r--r--net/ipv6/addrconf.c4
-rw-r--r--net/ipv6/ip6_fib.c32
-rw-r--r--net/ipv6/ip6_output.c4
-rw-r--r--net/ipv6/route.c127
-rw-r--r--net/ipv6/udp.c11
-rw-r--r--net/xfrm/xfrm_policy.c49
12 files changed, 203 insertions, 425 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index b8d6dd9e8b5c..5d1830b8d2cf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8681,7 +8681,6 @@ static int __init net_dev_init(void)
rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
NULL, dev_cpu_dead);
WARN_ON(rc < 0);
- dst_subsys_init();
rc = 0;
out:
return rc;
diff --git a/net/core/dst.c b/net/core/dst.c
index 13ba4a090c41..f851adb9ec9b 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -42,108 +42,6 @@
* to dirty as few cache lines as possible in __dst_free().
* As this is not a very strong hint, we dont force an alignment on SMP.
*/
-static struct {
- spinlock_t lock;
- struct dst_entry *list;
- unsigned long timer_inc;
- unsigned long timer_expires;
-} dst_garbage = {
- .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
- .timer_inc = DST_GC_MAX,
-};
-static void dst_gc_task(struct work_struct *work);
-static void ___dst_free(struct dst_entry *dst);
-
-static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
-
-static DEFINE_MUTEX(dst_gc_mutex);
-/*
- * long lived entries are maintained in this list, guarded by dst_gc_mutex
- */
-static struct dst_entry *dst_busy_list;
-
-static void dst_gc_task(struct work_struct *work)
-{
- int delayed = 0;
- int work_performed = 0;
- unsigned long expires = ~0L;
- struct dst_entry *dst, *next, head;
- struct dst_entry *last = &head;
-
- mutex_lock(&dst_gc_mutex);
- next = dst_busy_list;
-
-loop:
- while ((dst = next) != NULL) {
- next = dst->next;
- prefetch(&next->next);
- cond_resched();
- if (likely(atomic_read(&dst->__refcnt))) {
- last->next = dst;
- last = dst;
- delayed++;
- continue;
- }
- work_performed++;
-
- dst = dst_destroy(dst);
- if (dst) {
- /* NOHASH and still referenced. Unless it is already
- * on gc list, invalidate it and add to gc list.
- *
- * Note: this is temporary. Actually, NOHASH dst's
- * must be obsoleted when parent is obsoleted.
- * But we do not have state "obsoleted, but
- * referenced by parent", so it is right.
- */
- if (dst->obsolete > 0)
- continue;
-
- ___dst_free(dst);
- dst->next = next;
- next = dst;
- }
- }
-
- spin_lock_bh(&dst_garbage.lock);
- next = dst_garbage.list;
- if (next) {
- dst_garbage.list = NULL;
- spin_unlock_bh(&dst_garbage.lock);
- goto loop;
- }
- last->next = NULL;
- dst_busy_list = head.next;
- if (!dst_busy_list)
- dst_garbage.timer_inc = DST_GC_MAX;
- else {
- /*
- * if we freed less than 1/10 of delayed entries,
- * we can sleep longer.
- */
- if (work_performed <= delayed/10) {
- dst_garbage.timer_expires += dst_garbage.timer_inc;
- if (dst_garbage.timer_expires > DST_GC_MAX)
- dst_garbage.timer_expires = DST_GC_MAX;
- dst_garbage.timer_inc += DST_GC_INC;
- } else {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- }
- expires = dst_garbage.timer_expires;
- /*
- * if the next desired timer is more than 4 seconds in the
- * future then round the timer to whole seconds
- */
- if (expires > 4*HZ)
- expires = round_jiffies_relative(expires);
- schedule_delayed_work(&dst_gc_work, expires);
- }
-
- spin_unlock_bh(&dst_garbage.lock);
- mutex_unlock(&dst_gc_mutex);
-}
-
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
@@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
}
EXPORT_SYMBOL(dst_alloc);
-static void ___dst_free(struct dst_entry *dst)
-{
- /* The first case (dev==NULL) is required, when
- protocol module is unloaded.
- */
- if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- }
- dst->obsolete = DST_OBSOLETE_DEAD;
-}
-
-void __dst_free(struct dst_entry *dst)
-{
- spin_lock_bh(&dst_garbage.lock);
- ___dst_free(dst);
- dst->next = dst_garbage.list;
- dst_garbage.list = dst;
- if (dst_garbage.timer_inc > DST_GC_INC) {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- mod_delayed_work(system_wq, &dst_gc_work,
- dst_garbage.timer_expires);
- }
- spin_unlock_bh(&dst_garbage.lock);
-}
-EXPORT_SYMBOL(__dst_free);
-
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct dst_entry *child;
smp_rmb();
-again:
child = dst->child;
if (!(dst->flags & DST_NOCOUNT))
@@ -269,20 +138,8 @@ again:
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
- if (dst) {
- int nohash = dst->flags & DST_NOHASH;
-
- if (atomic_dec_and_test(&dst->__refcnt)) {
- /* We were real parent of this dst, so kill child. */
- if (nohash)
- goto again;
- } else {
- /* Child is still referenced, return it for freeing. */
- if (nohash)
- return dst;
- /* Child is still in his hash table */
- }
- }
+ if (dst)
+ dst_release_immediate(dst);
return NULL;
}
EXPORT_SYMBOL(dst_destroy);
@@ -292,26 +149,62 @@ static void dst_destroy_rcu(struct rcu_head *head)
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
dst = dst_destroy(dst);
- if (dst)
- __dst_free(dst);
}
+/* Operations to mark dst as DEAD and clean up the net device referenced
+ * by dst:
+ * 1. put the dst under loopback interface and discard all tx/rx packets
+ * on this route.
+ * 2. release the net_device
+ * This function should be called when removing routes from the fib tree
+ * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
+ * make the next dst_ops->check() fail.
+ */
+void dst_dev_put(struct dst_entry *dst)
+{
+ struct net_device *dev = dst->dev;
+
+ dst->obsolete = DST_OBSOLETE_DEAD;
+ if (dst->ops->ifdown)
+ dst->ops->ifdown(dst, dev, true);
+ dst->input = dst_discard;
+ dst->output = dst_discard_out;
+ dst->dev = dev_net(dst->dev)->loopback_dev;
+ dev_hold(dst->dev);
+ dev_put(dev);
+}
+EXPORT_SYMBOL(dst_dev_put);
+
void dst_release(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
- unsigned short nocache = dst->flags & DST_NOCACHE;
newrefcnt = atomic_dec_return(&dst->__refcnt);
if (unlikely(newrefcnt < 0))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
- if (!newrefcnt && unlikely(nocache))
+ if (!newrefcnt)
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
+void dst_release_immediate(struct dst_entry *dst)
+{
+ if (dst) {
+ int newrefcnt;
+
+ newrefcnt = atomic_dec_return(&dst->__refcnt);
+ if (unlikely(newrefcnt < 0))
+ net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+ __func__, dst, newrefcnt);
+ if (!newrefcnt)
+ dst_destroy(dst);
+ }
+}
+EXPORT_SYMBOL(dst_release_immediate);
+
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
@@ -377,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
dst = &md_dst->dst;
dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
- DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
+ DST_METADATA | DST_NOCOUNT);
dst->input = dst_md_discard;
dst->output = dst_md_discard_out;
@@ -423,86 +316,3 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
-
-/* Dirty hack. We did it in 2.2 (in __dst_free),
- * we have _very_ good reasons not to repeat
- * this mistake in 2.3, but we have no choice
- * now. _It_ _is_ _explicit_ _deliberate_
- * _race_ _condition_.
- *
- * Commented and originally written by Alexey.
- */
-static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
-{
- if (dst->ops->ifdown)
- dst->ops->ifdown(dst, dev, unregister);
-
- if (dev != dst->dev)
- return;
-
- if (!unregister) {
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- } else {
- dst->dev = dev_net(dst->dev)->loopback_dev;
- dev_hold(dst->dev);
- dev_put(dev);
- }
-}
-
-static int dst_dev_event(struct notifier_block *this, unsigned long event,
- void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct dst_entry *dst, *last = NULL;
-
- switch (event) {
- case NETDEV_UNREGISTER_FINAL:
- case NETDEV_DOWN:
- mutex_lock(&dst_gc_mutex);
- for (dst = dst_busy_list; dst; dst = dst->next) {
- last = dst;
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- }
-
- spin_lock_bh(&dst_garbage.lock);
- dst = dst_garbage.list;
- dst_garbage.list = NULL;
- /* The code in dst_ifdown places a hold on the loopback device.
- * If the gc entry processing is set to expire after a lengthy
- * interval, this hold can cause netdev_wait_allrefs() to hang
- * out and wait for a long time -- until the the loopback
- * interface is released. If we're really unlucky, it'll emit
- * pr_emerg messages to console too. Reset the interval here,
- * so dst cleanups occur in a more timely fashion.
- */
- if (dst_garbage.timer_inc > DST_GC_INC) {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- mod_delayed_work(system_wq, &dst_gc_work,
- dst_garbage.timer_expires);
- }
- spin_unlock_bh(&dst_garbage.lock);
-
- if (last)
- last->next = dst;
- else
- dst_busy_list = dst;
- for (; dst; dst = dst->next)
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- mutex_unlock(&dst_gc_mutex);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block dst_dev_notifier = {
- .notifier_call = dst_dev_event,
- .priority = -10, /* must be called after other network notifiers */
-};
-
-void __init dst_subsys_init(void)
-{
- register_netdevice_notifier(&dst_dev_notifier);
-}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6f95612b4d32..5d17d843ac86 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -183,11 +183,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst)
return dn_rt_hash_mask & (unsigned int)tmp;
}
-static inline void dnrt_free(struct dn_route *rt)
-{
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static void dn_dst_check_expire(unsigned long dummy)
{
int i;
@@ -202,14 +197,15 @@ static void dn_dst_check_expire(unsigned long dummy)
spin_lock(&dn_rt_hash_table[i].lock);
while ((rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
- if (atomic_read(&rt->dst.__refcnt) ||
- (now - rt->dst.lastuse) < expire) {
+ if (atomic_read(&rt->dst.__refcnt) > 1 ||
+ (now - rt->dst.lastuse) < expire) {
rtp = &rt->dst.dn_next;
continue;
}
*rtp = rt->dst.dn_next;
rt->dst.dn_next = NULL;
- dnrt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
spin_unlock(&dn_rt_hash_table[i].lock);
@@ -235,14 +231,15 @@ static int dn_dst_gc(struct dst_ops *ops)
while ((rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
- if (atomic_read(&rt->dst.__refcnt) ||
- (now - rt->dst.lastuse) < expire) {
+ if (atomic_read(&rt->dst.__refcnt) > 1 ||
+ (now - rt->dst.lastuse) < expire) {
rtp = &rt->dst.dn_next;
continue;
}
*rtp = rt->dst.dn_next;
rt->dst.dn_next = NULL;
- dnrt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
break;
}
spin_unlock_bh(&dn_rt_hash_table[i].lock);
@@ -344,7 +341,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
dst_use(&rth->dst, now);
spin_unlock_bh(&dn_rt_hash_table[hash].lock);
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
*rp = rth;
return 0;
}
@@ -374,7 +371,8 @@ static void dn_run_flush(unsigned long dummy)
for(; rt; rt = next) {
next = rcu_dereference_raw(rt->dst.dn_next);
RCU_INIT_POINTER(rt->dst.dn_next, NULL);
- dnrt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
nothing_to_declare:
@@ -1181,7 +1179,7 @@ make_route:
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
@@ -1215,6 +1213,7 @@ make_route:
goto e_neighbour;
hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
+ /* dn_insert_route() increments dst->__refcnt */
dn_insert_route(rt, hash, (struct dn_route **)pprt);
done:
@@ -1237,7 +1236,7 @@ e_nobufs:
err = -ENOBUFS;
goto done;
e_neighbour:
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
goto e_nobufs;
}
@@ -1445,7 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
}
make_route:
- rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
@@ -1491,6 +1490,7 @@ make_route:
goto e_neighbour;
hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
+ /* dn_insert_route() increments dst->__refcnt */
dn_insert_route(rt, hash, &rt);
skb_dst_set(skb, &rt->dst);
@@ -1514,7 +1514,7 @@ e_nobufs:
goto done;
e_neighbour:
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
goto done;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2157dc08c407..ff47ea1408fe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -152,7 +152,8 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp)
* free_fib_info_rcu()
*/
- dst_free(&rt->dst);
+ dst_dev_put(&rt->dst);
+ dst_release_immediate(&rt->dst);
}
static void free_nh_exceptions(struct fib_nh *nh)
@@ -194,8 +195,10 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
struct rtable *rt;
rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
- if (rt)
- dst_free(&rt->dst);
+ if (rt) {
+ dst_dev_put(&rt->dst);
+ dst_release_immediate(&rt->dst);
+ }
}
free_percpu(rtp);
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9b38cf18144e..c816cd53f7fc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -589,11 +589,6 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
build_sk_flow_key(fl4, sk);
}
-static inline void rt_free(struct rtable *rt)
-{
- call_rcu(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static DEFINE_SPINLOCK(fnhe_lock);
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
@@ -603,12 +598,14 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
- rt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
- rt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
}
@@ -1302,7 +1299,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
- __be32 daddr)
+ __be32 daddr, const bool do_cache)
{
bool ret = false;
@@ -1331,10 +1328,13 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
- if (!(rt->dst.flags & DST_NOCACHE)) {
+ if (do_cache) {
+ dst_hold(&rt->dst);
rcu_assign_pointer(*porig, rt);
- if (orig)
- rt_free(orig);
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
ret = true;
}
@@ -1357,12 +1357,20 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
}
orig = *p;
+ /* hold dst before doing cmpxchg() to avoid race condition
+ * on this dst
+ */
+ dst_hold(&rt->dst);
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
- if (orig)
- rt_free(orig);
- } else
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
+ } else {
+ dst_release(&rt->dst);
ret = false;
+ }
return ret;
}
@@ -1433,7 +1441,8 @@ static bool rt_cache_valid(const struct rtable *rt)
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
const struct fib_result *res,
struct fib_nh_exception *fnhe,
- struct fib_info *fi, u16 type, u32 itag)
+ struct fib_info *fi, u16 type, u32 itag,
+ const bool do_cache)
{
bool cached = false;
@@ -1454,8 +1463,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#endif
rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (unlikely(fnhe))
- cached = rt_bind_exception(rt, fnhe, daddr);
- else if (!(rt->dst.flags & DST_NOCACHE))
+ cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
+ else if (do_cache)
cached = rt_cache_route(nh, rt);
if (unlikely(!cached)) {
/* Routes we intend to cache in nexthop exception or
@@ -1463,7 +1472,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
* However, if we are unsuccessful at storing this
* route into the cache we really need to set it.
*/
- rt->dst.flags |= DST_NOCACHE;
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
rt_add_uncached_list(rt);
@@ -1486,7 +1494,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (will_cache ? 0 : DST_HOST) |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
@@ -1730,7 +1738,8 @@ rt_cache:
rth->dst.input = ip_forward;
- rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
+ do_cache);
set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
@@ -2018,10 +2027,8 @@ local_input:
rth->dst.input = lwtunnel_input;
}
- if (unlikely(!rt_cache_route(nh, rth))) {
- rth->dst.flags |= DST_NOCACHE;
+ if (unlikely(!rt_cache_route(nh, rth)))
rt_add_uncached_list(rth);
- }
}
skb_dst_set(skb, &rth->dst);
err = 0;
@@ -2217,10 +2224,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rth = rcu_dereference(*prth);
rt_cache:
- if (rt_cache_valid(rth)) {
- dst_hold(&rth->dst);
+ if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
return rth;
- }
}
add:
@@ -2254,7 +2259,7 @@ add:
#endif
}
- rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
set_lwt_redirect(rth);
return rth;
@@ -2504,7 +2509,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->input = dst_discard;
new->output = dst_discard_out;
- new->dev = ort->dst.dev;
+ new->dev = net->loopback_dev;
if (new->dev)
dev_hold(new->dev);
@@ -2519,7 +2524,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_uses_gateway = ort->rt_uses_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
- dst_free(new);
}
dst_release(dst_orig);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2bc638c48b86..f3450f092d71 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1977,9 +1977,10 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
struct dst_entry *old;
- dst_hold(dst);
- old = xchg(&sk->sk_rx_dst, dst);
- dst_release(old);
+ if (dst_hold_safe(dst)) {
+ old = xchg(&sk->sk_rx_dst, dst);
+ dst_release(old);
+ }
}
/*
@@ -2303,13 +2304,11 @@ void udp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst) {
- /* DST_NOCACHE can not be used without taking a reference */
- if (dst->flags & DST_NOCACHE) {
- if (likely(atomic_inc_not_zero(&dst->__refcnt)))
- skb_dst_set(skb, dst);
- } else {
- skb_dst_set_noref(skb, dst);
- }
+ /* set noref for now.
+ * any place which wants to hold dst has to call
+ * dst_hold_safe()
+ */
+ skb_dst_set_noref(skb, dst);
}
}
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0aa36b093013..2a6397714d70 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5576,8 +5576,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
ip6_del_rt(rt);
}
if (ifp->rt) {
- dst_hold(&ifp->rt->dst);
- ip6_del_rt(ifp->rt);
+ if (dst_hold_safe(&ifp->rt->dst))
+ ip6_del_rt(ifp->rt);
}
rt_genid_bump_ipv6(net);
break;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index deea901746c8..4f3e4657f2d6 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -153,11 +153,6 @@ static void node_free(struct fib6_node *fn)
kmem_cache_free(fib6_node_kmem, fn);
}
-static void rt6_rcu_free(struct rt6_info *rt)
-{
- call_rcu(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
{
int cpu;
@@ -172,7 +167,8 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
pcpu_rt = *ppcpu_rt;
if (pcpu_rt) {
- rt6_rcu_free(pcpu_rt);
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
*ppcpu_rt = NULL;
}
}
@@ -185,7 +181,8 @@ static void rt6_release(struct rt6_info *rt)
{
if (atomic_dec_and_test(&rt->rt6i_ref)) {
rt6_free_pcpu(rt);
- rt6_rcu_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
}
@@ -978,8 +975,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
int replace_required = 0;
int sernum = fib6_new_sernum(info->nl_net);
- if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) &&
- !atomic_read(&rt->dst.__refcnt)))
+ if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
return -EINVAL;
if (info->nlh) {
@@ -1076,7 +1072,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
fib6_start_gc(info->nl_net, rt);
if (!(rt->rt6i_flags & RTF_CACHE))
fib6_prune_clones(info->nl_net, pn);
- rt->dst.flags &= ~DST_NOCACHE;
}
out:
@@ -1101,8 +1096,10 @@ out:
atomic_inc(&pn->leaf->rt6i_ref);
}
#endif
- if (!(rt->dst.flags & DST_NOCACHE))
- dst_free(&rt->dst);
+ /* Always release dst as dst->__refcnt is guaranteed
+ * to be taken before entering this function
+ */
+ dst_release_immediate(&rt->dst);
}
return err;
@@ -1113,8 +1110,10 @@ out:
st_failure:
if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
fib6_repair_tree(info->nl_net, fn);
- if (!(rt->dst.flags & DST_NOCACHE))
- dst_free(&rt->dst);
+ /* Always release dst as dst->__refcnt is guaranteed
+ * to be taken before entering this function
+ */
+ dst_release_immediate(&rt->dst);
return err;
#endif
}
@@ -1783,7 +1782,7 @@ static int fib6_age(struct rt6_info *rt, void *arg)
}
gc_args->more++;
} else if (rt->rt6i_flags & RTF_CACHE) {
- if (atomic_read(&rt->dst.__refcnt) == 0 &&
+ if (atomic_read(&rt->dst.__refcnt) == 1 &&
time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
RT6_TRACE("aging clone %p\n", rt);
return -1;
@@ -1821,8 +1820,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
}
gc_args.timeout = expires ? (int)expires :
net->ipv6.sysctl.ip6_rt_gc_interval;
-
- gc_args.more = icmp6_dst_gc();
+ gc_args.more = 0;
fib6_clean_all(net, fib6_age, &gc_args);
now = jiffies;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8b8efb0e55bf..5baa6fab4b97 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -698,8 +698,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
ipv6_hdr(skb)->payload_len = htons(first_len -
sizeof(struct ipv6hdr));
- dst_hold(&rt->dst);
-
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
@@ -742,7 +740,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
if (err == 0) {
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGOKS);
- ip6_rt_put(rt);
return 0;
}
@@ -750,7 +747,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGFAILS);
- ip6_rt_put(rt);
return err;
slow_path_clean:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 18fe6e2b88d5..2e4490076061 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -128,7 +128,6 @@ static void rt6_uncached_list_add(struct rt6_info *rt)
{
struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
- rt->dst.flags |= DST_NOCACHE;
rt->rt6i_uncached_list = ul;
spin_lock_bh(&ul->lock);
@@ -354,7 +353,7 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
- 0, DST_OBSOLETE_FORCE_CHK, flags);
+ 1, DST_OBSOLETE_FORCE_CHK, flags);
if (rt)
rt6_info_init(rt);
@@ -381,7 +380,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net,
*p = NULL;
}
} else {
- dst_destroy((struct dst_entry *)rt);
+ dst_release_immediate(&rt->dst);
return NULL;
}
}
@@ -932,9 +931,9 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
- It takes new route entry, the addition fails by any reason the
- route is freed. In any case, if caller does not hold it, it may
- be destroyed.
+ * It takes new route entry, the addition fails by any reason the
+ * route is released.
+ * Caller must hold dst before calling it.
*/
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
@@ -957,6 +956,8 @@ int ip6_ins_rt(struct rt6_info *rt)
struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
struct mx6_config mxc = { .mx = NULL, };
+ /* Hold dst to account for the reference from the fib6 tree */
+ dst_hold(&rt->dst);
return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
@@ -1049,7 +1050,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
prev = cmpxchg(p, NULL, pcpu_rt);
if (prev) {
/* If someone did it before us, return prev instead */
- dst_destroy(&pcpu_rt->dst);
+ dst_release_immediate(&pcpu_rt->dst);
pcpu_rt = prev;
}
} else {
@@ -1059,7 +1060,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
* since rt is going away anyway. The next
* dst_check() will trigger a re-lookup.
*/
- dst_destroy(&pcpu_rt->dst);
+ dst_release_immediate(&pcpu_rt->dst);
pcpu_rt = rt;
}
dst_hold(&pcpu_rt->dst);
@@ -1129,12 +1130,15 @@ redo_rt6_select:
uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
dst_release(&rt->dst);
- if (uncached_rt)
+ if (uncached_rt) {
+ /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
+ * No need for another dst_hold()
+ */
rt6_uncached_list_add(uncached_rt);
- else
+ } else {
uncached_rt = net->ipv6.ip6_null_entry;
-
- dst_hold(&uncached_rt->dst);
+ dst_hold(&uncached_rt->dst);
+ }
trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
return uncached_rt;
@@ -1245,9 +1249,11 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
+ struct net_device *loopback_dev = net->loopback_dev;
struct dst_entry *new = NULL;
- rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
+ rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
+ DST_OBSOLETE_NONE, 0);
if (rt) {
rt6_info_init(rt);
@@ -1257,10 +1263,8 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
new->output = dst_discard_out;
dst_copy_metrics(new, &ort->dst);
- rt->rt6i_idev = ort->rt6i_idev;
- if (rt->rt6i_idev)
- in6_dev_hold(rt->rt6i_idev);
+ rt->rt6i_idev = in6_dev_get(loopback_dev);
rt->rt6i_gateway = ort->rt6i_gateway;
rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
rt->rt6i_metric = 0;
@@ -1269,8 +1273,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
#ifdef CONFIG_IPV6_SUBTREES
memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
-
- dst_free(new);
}
dst_release(dst_orig);
@@ -1323,7 +1325,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
rt6_dst_from_metrics_check(rt);
if (rt->rt6i_flags & RTF_PCPU ||
- (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
+ (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
return rt6_dst_from_check(rt, cookie);
else
return rt6_check(rt, cookie);
@@ -1356,8 +1358,8 @@ static void ip6_link_failure(struct sk_buff *skb)
rt = (struct rt6_info *) skb_dst(skb);
if (rt) {
if (rt->rt6i_flags & RTF_CACHE) {
- dst_hold(&rt->dst);
- ip6_del_rt(rt);
+ if (dst_hold_safe(&rt->dst))
+ ip6_del_rt(rt);
} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
rt->rt6i_node->fn_sernum = -1;
}
@@ -1421,6 +1423,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
* invalidate the sk->sk_dst_cache.
*/
ip6_ins_rt(nrt6);
+ /* Release the reference taken in
+ * ip6_rt_cache_alloc()
+ */
+ dst_release(&nrt6->dst);
}
}
}
@@ -1649,9 +1655,6 @@ out:
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
-static struct dst_entry *icmp6_dst_gc_list;
-static DEFINE_SPINLOCK(icmp6_dst_lock);
-
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
struct flowi6 *fl6)
{
@@ -1672,19 +1675,16 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
rt->dst.flags |= DST_HOST;
rt->dst.output = ip6_output;
- atomic_set(&rt->dst.__refcnt, 1);
rt->rt6i_gateway = fl6->daddr;
rt->rt6i_dst.addr = fl6->daddr;
rt->rt6i_dst.plen = 128;
rt->rt6i_idev = idev;
dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
- spin_lock_bh(&icmp6_dst_lock);
- rt->dst.next = icmp6_dst_gc_list;
- icmp6_dst_gc_list = &rt->dst;
- spin_unlock_bh(&icmp6_dst_lock);
-
- fib6_force_start_gc(net);
+ /* Add this dst into uncached_list so that rt6_ifdown() can
+ * do proper release of the net_device
+ */
+ rt6_uncached_list_add(rt);
dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
@@ -1692,48 +1692,6 @@ out:
return dst;
}
-int icmp6_dst_gc(void)
-{
- struct dst_entry *dst, **pprev;
- int more = 0;
-
- spin_lock_bh(&icmp6_dst_lock);
- pprev = &icmp6_dst_gc_list;
-
- while ((dst = *pprev) != NULL) {
- if (!atomic_read(&dst->__refcnt)) {
- *pprev = dst->next;
- dst_free(dst);
- } else {
- pprev = &dst->next;
- ++more;
- }
- }
-
- spin_unlock_bh(&icmp6_dst_lock);
-
- return more;
-}
-
-static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
- void *arg)
-{
- struct dst_entry *dst, **pprev;
-
- spin_lock_bh(&icmp6_dst_lock);
- pprev = &icmp6_dst_gc_list;
- while ((dst = *pprev) != NULL) {
- struct rt6_info *rt = (struct rt6_info *) dst;
- if (func(rt, arg)) {
- *pprev = dst->next;
- dst_free(dst);
- } else {
- pprev = &dst->next;
- }
- }
- spin_unlock_bh(&icmp6_dst_lock);
-}
-
static int ip6_dst_gc(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
@@ -2130,7 +2088,7 @@ out:
if (idev)
in6_dev_put(idev);
if (rt)
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
return ERR_PTR(err);
}
@@ -2160,7 +2118,7 @@ int ip6_route_add(struct fib6_config *cfg,
return err;
out:
if (rt)
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
return err;
}
@@ -2171,8 +2129,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
struct fib6_table *table;
struct net *net = dev_net(rt->dst.dev);
- if (rt == net->ipv6.ip6_null_entry ||
- rt->dst.flags & DST_NOCACHE) {
+ if (rt == net->ipv6.ip6_null_entry) {
err = -ENOENT;
goto out;
}
@@ -2397,7 +2354,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
if (ip6_ins_rt(nrt))
- goto out;
+ goto out_release;
netevent.old = &rt->dst;
netevent.new = &nrt->dst;
@@ -2410,6 +2367,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
ip6_del_rt(rt);
}
+out_release:
+ /* Release the reference taken in
+ * ip6_rt_cache_alloc()
+ */
+ dst_release(&nrt->dst);
+
out:
neigh_release(neigh);
}
@@ -2757,9 +2720,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
rt->rt6i_dst.plen = 128;
tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
rt->rt6i_table = fib6_get_table(net, tb_id);
- rt->dst.flags |= DST_NOCACHE;
-
- atomic_set(&rt->dst.__refcnt, 1);
return rt;
}
@@ -2847,7 +2807,6 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
};
fib6_clean_all(net, fib6_ifdown, &adn);
- icmp6_clean_all(fib6_ifdown, &adn);
if (dev)
rt6_uncached_list_flush_dev(net, dev);
}
@@ -3185,7 +3144,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
if (err) {
- dst_free(&rt->dst);
+ dst_release_immediate(&rt->dst);
goto cleanup;
}
@@ -3249,7 +3208,7 @@ add_errout:
cleanup:
list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
if (nh->rt6_info)
- dst_free(&nh->rt6_info->dst);
+ dst_release_immediate(&nh->rt6_info->dst);
kfree(nh->mxc.mx);
list_del(&nh->next);
kfree(nh);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2e9b52bded2d..2b33847bf931 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -920,12 +920,11 @@ static void udp_v6_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
if (dst) {
- if (dst->flags & DST_NOCACHE) {
- if (likely(atomic_inc_not_zero(&dst->__refcnt)))
- skb_dst_set(skb, dst);
- } else {
- skb_dst_set_noref(skb, dst);
- }
+ /* set noref for now.
+ * any place which wants to hold dst has to call
+ * dst_hold_safe()
+ */
+ skb_dst_set_noref(skb, dst);
}
}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ed4e52d95172..af8e38f47b5b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1590,7 +1590,9 @@ static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
struct dst_entry *dst = &xdst->u.dst;
- dst_free(dst);
+ /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
+ dst->obsolete = DST_OBSOLETE_DEAD;
+ dst_release_immediate(dst);
}
static const struct flow_cache_ops xfrm_bundle_fc_ops = {
@@ -1620,7 +1622,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
default:
BUG();
}
- xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0);
+ xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
if (likely(xdst)) {
struct dst_entry *dst = &xdst->u.dst;
@@ -1723,10 +1725,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
if (!dst_prev)
dst0 = dst1;
- else {
- dst_prev->child = dst_clone(dst1);
- dst1->flags |= DST_NOHASH;
- }
+ else
+ /* Ref count is taken during xfrm_alloc_dst()
+ * No need to do dst_clone() on dst1
+ */
+ dst_prev->child = dst1;
xdst->route = dst;
dst_copy_metrics(dst1, dst);
@@ -1792,7 +1795,7 @@ put_states:
xfrm_state_put(xfrm[i]);
free_dst:
if (dst0)
- dst_free(dst0);
+ dst_release_immediate(dst0);
dst0 = ERR_PTR(err);
goto out;
}
@@ -2073,7 +2076,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
pol_dead |= pols[i]->walk.dead;
}
if (pol_dead) {
- dst_free(&xdst->u.dst);
+ /* Mark DST_OBSOLETE_DEAD to fail the next
+ * xfrm_dst_check()
+ */
+ xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
+ dst_release_immediate(&xdst->u.dst);
xdst = NULL;
num_pols = 0;
num_xfrms = 0;
@@ -2120,11 +2127,12 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
if (xdst) {
/* The policies were stolen for newly generated bundle */
xdst->num_pols = 0;
- dst_free(&xdst->u.dst);
+ /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
+ xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
+ dst_release_immediate(&xdst->u.dst);
}
- /* Flow cache does not have reference, it dst_free()'s,
- * but we do need to return one reference for original caller */
+ /* We do need to return one reference for original caller */
dst_hold(&new_xdst->u.dst);
return &new_xdst->flo;
@@ -2147,9 +2155,11 @@ make_dummy_bundle:
inc_error:
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
- if (xdst != NULL)
- dst_free(&xdst->u.dst);
- else
+ if (xdst != NULL) {
+ /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
+ xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
+ dst_release_immediate(&xdst->u.dst);
+ } else
xfrm_pols_put(pols, num_pols);
return ERR_PTR(err);
}
@@ -2221,7 +2231,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
}
dst_hold(&xdst->u.dst);
- xdst->u.dst.flags |= DST_NOCACHE;
route = xdst->route;
}
}
@@ -2636,10 +2645,12 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
* notice. That's what we are validating here via the
* stale_bundle() check.
*
- * When a policy's bundle is pruned, we dst_free() the XFRM
- * dst which causes it's ->obsolete field to be set to
- * DST_OBSOLETE_DEAD. If an XFRM dst has been pruned like
- * this, we want to force a new route lookup.
+ * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will
+ * be marked on it.
+ * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will
+ * be marked on it.
+ * Both will force stable_bundle() to fail on any xdst bundle with
+ * this dst linked in it.
*/
if (dst->obsolete < 0 && !stale_bundle(dst))
return dst;