diff options
Diffstat (limited to 'mm/workingset.c')
-rw-r--r-- | mm/workingset.c | 201 |
1 files changed, 117 insertions, 84 deletions
diff --git a/mm/workingset.c b/mm/workingset.c index 4516dd790129..d46f8c92aa2f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -121,7 +121,7 @@ * the only thing eating into inactive list space is active pages. * * - * Activating refaulting pages + * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given @@ -134,6 +134,10 @@ * used less frequently than the refaulting page - or even not used at * all anymore. * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. @@ -141,6 +145,14 @@ * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. + * * * Implementation * @@ -148,21 +160,20 @@ * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the node) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ -#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) +#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of - * actionable refaults. However, bits are tight in the radix tree + * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group @@ -170,23 +181,27 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) { eviction >>= bucket_order; + eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + eviction = (eviction << 1) | workingset; - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, bool *workingsetp) { - unsigned long entry = (unsigned long)shadow; + unsigned long entry = xa_to_value(shadow); int memcgid, nid; + bool workingset; - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + workingset = entry & 1; + entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -195,6 +210,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *workingsetp = workingset; } /** @@ -207,8 +223,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg = page_memcg(page); struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -220,30 +236,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, pgdat, eviction); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } /** * workingset_refault - evaluate the refault of a previously evicted page + * @page: the freshly allocated replacement page * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously * evicted page in the context of the node it was allocated in. - * - * Returns %true if the page should be activated, %false otherwise. */ -bool workingset_refault(void *shadow) +void workingset_refault(struct page *page, void *shadow) { unsigned long refault_distance; + struct pglist_data *pgdat; unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct pglist_data *pgdat; + bool workingset; int memcgid; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* @@ -263,41 +279,51 @@ bool workingset_refault(void *shadow) * configurations instead. */ memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) { - rcu_read_unlock(); - return false; - } + if (!mem_cgroup_disabled() && !memcg) + goto out; lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. + * Calculate the refault distance * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * inactive_age to lap a shadow entry in the field, which can + * then result in a false small refault distance, leading to a + * false activation should this old entry actually refault + * again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - if (refault_distance <= active_file) { - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); - rcu_read_unlock(); - return true; + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ + if (refault_distance > active_file) + goto out; + + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } +out: rcu_read_unlock(); - return false; } /** @@ -340,7 +366,7 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node) +void workingset_update_node(struct xa_node *node) { /* * Track non-empty nodes that contain only shadow entries; @@ -350,12 +376,20 @@ void workingset_update_node(struct radix_tree_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - if (node->count && node->count == node->exceptional) { - if (list_empty(&node->private_list)) + VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); + __inc_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } else { - if (!list_empty(&node->private_list)) + if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); + __dec_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } } @@ -364,12 +398,12 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, { unsigned long max_nodes; unsigned long nodes; - unsigned long cache; + unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); /* - * Approximate a reasonable limit for the radix tree nodes + * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. @@ -384,20 +418,26 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * - * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ +#ifdef CONFIG_MEMCG if (sc->memcg) { - cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + struct lruvec *lruvec; + + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL); + lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + } else +#endif + pages = node_present_pages(sc->nid); + + max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; @@ -410,11 +450,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, - void *arg) + void *arg) __must_hold(lru_lock) { + struct xa_node *node = container_of(item, struct xa_node, private_list); + XA_STATE(xas, node->array, 0); struct address_space *mapping; - struct radix_tree_node *node; - unsigned int i; int ret; /* @@ -422,15 +462,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any - * address_space that has radix tree nodes on the LRU. + * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, i_pages); + mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { @@ -440,6 +479,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); + __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + spin_unlock(lru_lock); /* @@ -447,29 +488,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(node->count != node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[i]) { - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) - goto out_invalid; - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(!mapping->nrexceptional)) - goto out_invalid; - node->slots[i] = NULL; - node->exceptional--; - node->count--; - mapping->nrexceptional--; - } - } - if (WARN_ON_ONCE(node->exceptional)) + if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->i_pages, node, - workingset_lookup_update(mapping)); + mapping->nrexceptional -= node->nr_values; + xas.xa_node = xa_parent_locked(&mapping->i_pages, node); + xas.xa_offset = node->offset; + xas.xa_shift = node->shift + XA_CHUNK_SHIFT; + xas_set_update(&xas, workingset_update_node); + /* + * We could store a shadow entry here which was the minimum of the + * shadow entries we were tracking ... + */ + xas_store(&xas, NULL); + __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); @@ -491,7 +524,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, - .seeks = DEFAULT_SEEKS, + .seeks = 0, /* ->count reports only fully expendable nodes */ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; |