summaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c324
1 files changed, 192 insertions, 132 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe715daeb8bc..8ed1b775bdc9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
}
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+ struct mem_cgroup *root = sc->target_mem_cgroup;
+ return !mem_cgroup_disabled() &&
+ mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
+}
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+ return false;
+}
#endif
unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -174,14 +186,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
}
/*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
*/
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
{
- atomic_long_set(&shrinker->nr_in_batch, 0);
+ size_t size = sizeof(*shrinker->nr_deferred);
+
+ /*
+ * If we only have one possible node in the system anyway, save
+ * ourselves the trouble and disable NUMA aware behavior. This way we
+ * will save memory and some small loop time later.
+ */
+ if (nr_node_ids == 1)
+ shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+ if (shrinker->flags & SHRINKER_NUMA_AWARE)
+ size *= nr_node_ids;
+
+ shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+ if (!shrinker->nr_deferred)
+ return -ENOMEM;
+
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
+ return 0;
}
EXPORT_SYMBOL(register_shrinker);
@@ -196,15 +225,102 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);
-static inline int do_shrinker_shrink(struct shrinker *shrinker,
- struct shrink_control *sc,
- unsigned long nr_to_scan)
-{
- sc->nr_to_scan = nr_to_scan;
- return (*shrinker->shrink)(shrinker, sc);
+#define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+ unsigned long freed = 0;
+ unsigned long long delta;
+ long total_scan;
+ long max_pass;
+ long nr;
+ long new_nr;
+ int nid = shrinkctl->nid;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+
+ max_pass = shrinker->count_objects(shrinker, shrinkctl);
+ if (max_pass == 0)
+ return 0;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+ total_scan = nr;
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
+ delta *= max_pass;
+ do_div(delta, lru_pages + 1);
+ total_scan += delta;
+ if (total_scan < 0) {
+ printk(KERN_ERR
+ "shrink_slab: %pF negative objects to delete nr=%ld\n",
+ shrinker->scan_objects, total_scan);
+ total_scan = max_pass;
+ }
+
+ /*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;
+
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
+
+ while (total_scan >= batch_size) {
+ unsigned long ret;
+
+ shrinkctl->nr_to_scan = batch_size;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
+
+ count_vm_events(SLABS_SCANNED, batch_size);
+ total_scan -= batch_size;
+
+ cond_resched();
+ }
+
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ if (total_scan > 0)
+ new_nr = atomic_long_add_return(total_scan,
+ &shrinker->nr_deferred[nid]);
+ else
+ new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+ trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+ return freed;
}
-#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
*
@@ -224,115 +340,45 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(struct shrink_control *shrink,
+unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
- unsigned long ret = 0;
+ unsigned long freed = 0;
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
- /* Assume we'll be able to shrink next time */
- ret = 1;
+ /*
+ * If we would return 0, our callers would understand that we
+ * have nothing else to shrink and give up trying. By returning
+ * 1 we keep it going and assume we'll be able to shrink next
+ * time.
+ */
+ freed = 1;
goto out;
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- unsigned long long delta;
- long total_scan;
- long max_pass;
- int shrink_ret = 0;
- long nr;
- long new_nr;
- long batch_size = shrinker->batch ? shrinker->batch
- : SHRINK_BATCH;
-
- max_pass = do_shrinker_shrink(shrinker, shrink, 0);
- if (max_pass <= 0)
- continue;
-
- /*
- * copy the current shrinker scan count into a local variable
- * and zero it so that other concurrent shrinker invocations
- * don't also do this scanning work.
- */
- nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
- total_scan = nr;
- delta = (4 * nr_pages_scanned) / shrinker->seeks;
- delta *= max_pass;
- do_div(delta, lru_pages + 1);
- total_scan += delta;
- if (total_scan < 0) {
- printk(KERN_ERR "shrink_slab: %pF negative objects to "
- "delete nr=%ld\n",
- shrinker->shrink, total_scan);
- total_scan = max_pass;
- }
-
- /*
- * We need to avoid excessive windup on filesystem shrinkers
- * due to large numbers of GFP_NOFS allocations causing the
- * shrinkers to return -1 all the time. This results in a large
- * nr being built up so when a shrink that can do some work
- * comes along it empties the entire cache due to nr >>>
- * max_pass. This is bad for sustaining a working set in
- * memory.
- *
- * Hence only allow the shrinker to scan the entire cache when
- * a large delta change is calculated directly.
- */
- if (delta < max_pass / 4)
- total_scan = min(total_scan, max_pass / 2);
-
- /*
- * Avoid risking looping forever due to too large nr value:
- * never try to free more than twice the estimate number of
- * freeable entries.
- */
- if (total_scan > max_pass * 2)
- total_scan = max_pass * 2;
-
- trace_mm_shrink_slab_start(shrinker, shrink, nr,
- nr_pages_scanned, lru_pages,
- max_pass, delta, total_scan);
-
- while (total_scan >= batch_size) {
- int nr_before;
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (!node_online(shrinkctl->nid))
+ continue;
- nr_before = do_shrinker_shrink(shrinker, shrink, 0);
- shrink_ret = do_shrinker_shrink(shrinker, shrink,
- batch_size);
- if (shrink_ret == -1)
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+ (shrinkctl->nid != 0))
break;
- if (shrink_ret < nr_before)
- ret += nr_before - shrink_ret;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
- cond_resched();
- }
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
- /*
- * move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates. If we exhausted the
- * scan, there is no need to do an update.
- */
- if (total_scan > 0)
- new_nr = atomic_long_add_return(total_scan,
- &shrinker->nr_in_batch);
- else
- new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
- trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+ }
}
up_read(&shrinker_rwsem);
out:
cond_resched();
- return ret;
+ return freed;
}
static inline int is_page_cache_freeable(struct page *page)
@@ -2130,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone,
}
}
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static int
+__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
{
unsigned long nr_reclaimed, nr_scanned;
+ int groups_scanned = 0;
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2140,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
.zone = zone,
.priority = sc->priority,
};
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
+ mem_cgroup_iter_filter filter = (soft_reclaim) ?
+ mem_cgroup_soft_reclaim_eligible : NULL;
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
- do {
+ while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
struct lruvec *lruvec;
+ groups_scanned++;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
shrink_lruvec(lruvec, sc);
@@ -2168,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
mem_cgroup_iter_break(root, memcg);
break;
}
- memcg = mem_cgroup_iter(root, memcg, &reclaim);
- } while (memcg);
+ }
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
@@ -2177,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
+
+ return groups_scanned;
+}
+
+
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+ bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
+ unsigned long nr_scanned = sc->nr_scanned;
+ int scanned_groups;
+
+ scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
+ /*
+ * memcg iterator might race with other reclaimer or start from
+ * a incomplete tree walk so the tree walk in __shrink_zone
+ * might have missed groups that are above the soft limit. Try
+ * another loop to catch up with others. Do it just once to
+ * prevent from reclaim latencies when other reclaimers always
+ * preempt this one.
+ */
+ if (do_soft_reclaim && !scanned_groups)
+ __shrink_zone(zone, sc, do_soft_reclaim);
+
+ /*
+ * No group is over the soft limit or those that are do not have
+ * pages in the zone we are reclaiming so we have to reclaim everybody
+ */
+ if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
+ __shrink_zone(zone, sc, false);
+ return;
+ }
}
/* Returns true if compaction should go ahead for a high-order request */
@@ -2240,8 +2320,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- unsigned long nr_soft_reclaimed;
- unsigned long nr_soft_scanned;
bool aborted_reclaim = false;
/*
@@ -2281,18 +2359,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
continue;
}
}
- /*
- * This steals pages from memory cgroups over softlimit
- * and returns the number of reclaimed pages and
- * scanned pages. This works for global memory pressure
- * and balancing, not for a memcg's limit.
- */
- nr_soft_scanned = 0;
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
- sc->order, sc->gfp_mask,
- &nr_soft_scanned);
- sc->nr_reclaimed += nr_soft_reclaimed;
- sc->nr_scanned += nr_soft_scanned;
/* need some check for avoid more shrink_zone() */
}
@@ -2368,12 +2434,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
if (global_reclaim(sc)) {
unsigned long lru_pages = 0;
+
+ nodes_clear(shrink->nodes_to_scan);
for_each_zone_zonelist(zone, z, zonelist,
gfp_zone(sc->gfp_mask)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
lru_pages += zone_reclaimable_pages(zone);
+ node_set(zone_to_nid(zone),
+ shrink->nodes_to_scan);
}
shrink_slab(shrink, sc->nr_scanned, lru_pages);
@@ -2829,6 +2899,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
return true;
shrink_zone(zone, sc);
+ nodes_clear(shrink.nodes_to_scan);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
reclaim_state->reclaimed_slab = 0;
shrink_slab(&shrink, sc->nr_scanned, lru_pages);
@@ -2880,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
- unsigned long nr_soft_reclaimed;
- unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.priority = DEF_PRIORITY,
@@ -2996,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
sc.nr_scanned = 0;
- nr_soft_scanned = 0;
- /*
- * Call soft limit reclaim before calling shrink_zone.
- */
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
- order, sc.gfp_mask,
- &nr_soft_scanned);
- sc.nr_reclaimed += nr_soft_reclaimed;
-
/*
* There should be no need to raise the scanning
* priority if enough pages are already being scanned
@@ -3520,10 +3581,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* number of slab pages and shake the slab until it is reduced
* by the same nr_pages that we used for reclaiming unmapped
* pages.
- *
- * Note that shrink_slab will free memory on all zones and may
- * take a long time.
*/
+ nodes_clear(shrink.nodes_to_scan);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
for (;;) {
unsigned long lru_pages = zone_reclaimable_pages(zone);