path: root/mm/memory.c
author	Shakeel Butt <shakeelb@google.com>	2022-10-24 07:28:41 +0200
committer	Andrew Morton <akpm@linux-foundation.org>	2022-12-01 00:58:40 +0100
commit	f1a7941243c102a44e8847e3b94ff4ff3ec56f25 (patch)
tree	3d1f4938bbf28c7b35003c1d18bbfce011a61480 /mm/memory.c
parent	selftests/damon: add tests for DAMON_LRU_SORT's enabled parameter (diff)
mm: convert mm's rss stats into percpu_counter
Currently mm_struct maintains rss_stats, which are updated on the page fault and unmapping codepaths. For the page fault codepath the updates are cached per thread, with a batch of TASK_RSS_EVENTS_THRESH, which is 64. The reason for caching is performance for multithreaded applications; otherwise the rss_stats updates may become a hotspot for such applications.

However, this optimization comes at the cost of an error margin in the rss stats. The rss_stats for applications with a large number of threads can be very skewed. At worst the error margin is (nr_threads * 64), and we have a lot of applications with 100s of threads, so the error margin can be very high. Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32.

Recently we started seeing unbounded errors in rss_stats for specific applications which use TCP rx0cp (receive zerocopy). It seems like the vm_insert_pages() codepath does not sync rss_stats at all.

This patch converts the rss_stats into percpu_counter, which changes the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). In addition, this conversion enables us to get accurate stats in situations where accuracy is more important than the cpu cost.

This patch does not make that tradeoff - we can just use percpu_counter_add_local() for the updates and percpu_counter_sum() (or percpu_counter_sync() + percpu_counter_read()) for the readers. At the moment the readers are the procfs interface, the oom_killer and memory reclaim, which I think are not performance critical and should be OK with a slow read. However, I think we can make that change in a separate patch.

Link: https://lkml.kernel.org/r/20221024052841.3291983-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
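For context, a minimal sketch of the update/read split the message describes, assuming only the long-standing <linux/percpu_counter.h> API; the demo_rss_* names are illustrative and are not part of this patch:

#include <linux/gfp.h>
#include <linux/percpu_counter.h>

static struct percpu_counter demo_rss;	/* stand-in for one MM_* counter */

static int demo_rss_init(void)
{
	/* allocate the per-CPU slots; counter starts at 0 */
	return percpu_counter_init(&demo_rss, 0, GFP_KERNEL);
}

/* hot path (e.g. page fault): cheap per-CPU add, no shared-cacheline bounce */
static void demo_rss_add(long pages)
{
	percpu_counter_add(&demo_rss, pages);
}

/* reader path (procfs / oom_killer style): fold all per-CPU deltas, exact but slower */
static long demo_rss_read(void)
{
	return (long)percpu_counter_sum(&demo_rss);
}

static void demo_rss_exit(void)
{
	percpu_counter_destroy(&demo_rss);
}

The patch itself uses percpu_counter_add_local() on the update path, as the message notes; percpu_counter_add() is shown here only because it is the generic helper.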
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	73
1 file changed, 11 insertions(+), 62 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 7826143ec9cd..e0555ddd71b5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -162,58 +162,11 @@ static int __init init_zero_pfn(void)
}
early_initcall(init_zero_pfn);
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
- trace_rss_stat(mm, member, count);
+ trace_rss_stat(mm, member);
}
-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- if (current->rss_stat.count[i]) {
- add_mm_counter(mm, i, current->rss_stat.count[i]);
- current->rss_stat.count[i] = 0;
- }
- }
- current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
- struct task_struct *task = current;
-
- if (likely(task->mm == mm))
- task->rss_stat.count[member] += val;
- else
- add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH (64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
- if (unlikely(task != current))
- return;
- if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
- sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
-
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
-
-static void check_sync_rss_stat(struct task_struct *task)
-{
-}
-
-#endif /* SPLIT_RSS_COUNTING */
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -1857,7 +1810,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
@@ -3153,12 +3106,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter_fast(mm,
- mm_counter_file(old_page));
- inc_mm_counter_fast(mm, MM_ANONPAGES);
+ dec_mm_counter(mm, mm_counter_file(old_page));
+ inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
- inc_mm_counter_fast(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
@@ -3965,8 +3917,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (should_try_to_free_swap(folio, vma, vmf->flags))
folio_free_swap(folio);
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
/*
@@ -4146,7 +4098,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_MISSING);
}
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
@@ -4336,11 +4288,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
entry = pte_mkuffd_wp(pte_wrprotect(entry));
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
- inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
@@ -5192,9 +5144,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);
- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
-
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))