// SPDX-License-Identifier: GPL-2.0 /* * DAMON Primitives for Virtual Address Spaces * * Author: SeongJae Park */ #define pr_fmt(fmt) "damon-va: " fmt #include #include #include #include #include #include #include #include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION #define DAMON_MIN_REGION 1 #endif /* * 't->pid' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ static inline struct task_struct *damon_get_task_struct(struct damon_target *t) { return get_pid_task(t->pid, PIDTYPE_PID); } /* * Get the mm_struct of the given target * * Caller _must_ put the mm_struct after use, unless it is NULL. * * Returns the mm_struct of the target on success, NULL on failure */ static struct mm_struct *damon_get_mm(struct damon_target *t) { struct task_struct *task; struct mm_struct *mm; task = damon_get_task_struct(t); if (!task) return NULL; mm = get_task_mm(task); put_task_struct(task); return mm; } /* * Functions for the initial monitoring target regions construction */ /* * Size-evenly split a region into 'nr_pieces' small regions * * Returns 0 on success, or negative error code otherwise. */ static int damon_va_evenly_split_region(struct damon_target *t, struct damon_region *r, unsigned int nr_pieces) { unsigned long sz_orig, sz_piece, orig_end; struct damon_region *n = NULL, *next; unsigned long start; if (!r || !nr_pieces) return -EINVAL; orig_end = r->ar.end; sz_orig = damon_sz_region(r); sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); if (!sz_piece) return -EINVAL; r->ar.end = r->ar.start + sz_piece; next = damon_next_region(r); for (start = r->ar.end; start + sz_piece <= orig_end; start += sz_piece) { n = damon_new_region(start, start + sz_piece); if (!n) return -ENOMEM; damon_insert_region(n, r, next, t); r = n; } /* complement last region for possible rounding error */ if (n) n->ar.end = orig_end; return 0; } static unsigned long sz_range(struct damon_addr_range *r) { return r->end - r->start; } /* * Find three regions separated by two biggest unmapped regions * * vma the head vma of the target address space * regions an array of three address ranges that results will be saved * * This function receives an address space and finds three regions in it which * separated by the two biggest unmapped regions in the space. Please refer to * below comments of '__damon_va_init_regions()' function to know why this is * necessary. * * Returns 0 if success, or negative error code otherwise. */ static int __damon_va_three_regions(struct mm_struct *mm, struct damon_addr_range regions[3]) { struct damon_addr_range first_gap = {0}, second_gap = {0}; VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma, *prev = NULL; unsigned long start; /* * Find the two biggest gaps so that first_gap > second_gap > others. * If this is too slow, it can be optimised to examine the maple * tree gaps. */ for_each_vma(vmi, vma) { unsigned long gap; if (!prev) { start = vma->vm_start; goto next; } gap = vma->vm_start - prev->vm_end; if (gap > sz_range(&first_gap)) { second_gap = first_gap; first_gap.start = prev->vm_end; first_gap.end = vma->vm_start; } else if (gap > sz_range(&second_gap)) { second_gap.start = prev->vm_end; second_gap.end = vma->vm_start; } next: prev = vma; } if (!sz_range(&second_gap) || !sz_range(&first_gap)) return -EINVAL; /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION); return 0; } /* * Get the three regions in the given target (task) * * Returns 0 on success, negative error code otherwise. */ static int damon_va_three_regions(struct damon_target *t, struct damon_addr_range regions[3]) { struct mm_struct *mm; int rc; mm = damon_get_mm(t); if (!mm) return -EINVAL; mmap_read_lock(mm); rc = __damon_va_three_regions(mm, regions); mmap_read_unlock(mm); mmput(mm); return rc; } /* * Initialize the monitoring target regions for the given target (task) * * t the given target * * Because only a number of small portions of the entire address space * is actually mapped to the memory and accessed, monitoring the unmapped * regions is wasteful. That said, because we can deal with small noises, * tracking every mapping is not strictly required but could even incur a high * overhead if the mapping frequently changes or the number of mappings is * high. The adaptive regions adjustment mechanism will further help to deal * with the noise by simply identifying the unmapped areas as a region that * has no access. Moreover, applying the real mappings that would have many * unmapped areas inside will make the adaptive mechanism quite complex. That * said, too huge unmapped areas inside the monitoring target should be removed * to not take the time for the adaptive mechanism. * * For the reason, we convert the complex mappings to three distinct regions * that cover every mapped area of the address space. Also the two gaps * between the three regions are the two biggest unmapped areas in the given * address space. In detail, this function first identifies the start and the * end of the mappings and the two biggest unmapped areas of the address space. * Then, it constructs the three regions as below: * * [mappings[0]->start, big_two_unmapped_areas[0]->start) * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) * * As usual memory map of processes is as below, the gap between the heap and * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed * region and the stack will be two biggest unmapped regions. Because these * gaps are exceptionally huge areas in usual address space, excluding these * two biggest unmapped regions will be sufficient to make a trade-off. * * * * * (other mmap()-ed regions and small unmapped regions) * * * */ static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; int i, tidx = 0; if (damon_va_three_regions(t, regions)) { damon_for_each_target(ti, ctx) { if (ti == t) break; tidx++; } pr_debug("Failed to get three regions of %dth target\n", tidx); return; } for (i = 0; i < 3; i++) sz += regions[i].end - regions[i].start; if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; /* Set the initial three regions of the target */ for (i = 0; i < 3; i++) { r = damon_new_region(regions[i].start, regions[i].end); if (!r) { pr_err("%d'th init region creation failed\n", i); return; } damon_add_region(r, t); nr_pieces = (regions[i].end - regions[i].start) / sz; damon_va_evenly_split_region(t, r, nr_pieces); } } /* Initialize '->regions_list' of every target (task) */ static void damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; damon_for_each_target(t, ctx) { /* the user may set the target regions as they want */ if (!damon_nr_regions(t)) __damon_va_init_regions(ctx, t); } } /* * Update regions for current memory mappings */ static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; damon_set_regions(t, three_regions, 3); } } static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; pmd_t pmde; spinlock_t *ptl; if (pmd_trans_huge(pmdp_get(pmd))) { ptl = pmd_lock(walk->mm, pmd); pmde = pmdp_get(pmd); if (!pmd_present(pmde)) { spin_unlock(ptl); return 0; } if (pmd_trans_huge(pmde)) { damon_pmdp_mkold(pmd, walk->vma, addr); spin_unlock(ptl); return 0; } spin_unlock(ptl); } pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } if (!pte_present(ptep_get(pte))) goto out; damon_ptep_mkold(pte, walk->vma, addr); out: pte_unmap_unlock(pte, ptl); return 0; } #ifdef CONFIG_HUGETLB_PAGE static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr) { bool referenced = false; pte_t entry = huge_ptep_get(mm, addr, pte); struct folio *folio = pfn_folio(pte_pfn(entry)); unsigned long psize = huge_page_size(hstate_vma(vma)); folio_get(folio); if (pte_young(entry)) { referenced = true; entry = pte_mkold(entry); set_huge_pte_at(mm, addr, pte, entry, psize); } #ifdef CONFIG_MMU_NOTIFIER if (mmu_notifier_clear_young(mm, addr, addr + huge_page_size(hstate_vma(vma)))) referenced = true; #endif /* CONFIG_MMU_NOTIFIER */ if (referenced) folio_set_young(folio); folio_set_idle(folio); folio_put(folio); } static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct hstate *h = hstate_vma(walk->vma); spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(h, walk->mm, pte); entry = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(entry)) goto out; damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); out: spin_unlock(ptl); return 0; } #else #define damon_mkold_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, .hugetlb_entry = damon_mkold_hugetlb_entry, .walk_lock = PGWALK_RDLOCK, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) { mmap_read_lock(mm); walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); mmap_read_unlock(mm); } /* * Functions for the access checking of the regions */ static void __damon_va_prepare_access_check(struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); damon_va_mkold(mm, r->sampling_addr); } static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; struct damon_region *r; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) continue; damon_for_each_region(r, t) __damon_va_prepare_access_check(mm, r); mmput(mm); } } struct damon_young_walk_private { /* size of the folio for the access checked virtual memory address */ unsigned long *folio_sz; bool young; }; static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; pte_t ptent; spinlock_t *ptl; struct folio *folio; struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(pmdp_get(pmd))) { pmd_t pmde; ptl = pmd_lock(walk->mm, pmd); pmde = pmdp_get(pmd); if (!pmd_present(pmde)) { spin_unlock(ptl); return 0; } if (!pmd_trans_huge(pmde)) { spin_unlock(ptl); goto regular_page; } folio = damon_get_folio(pmd_pfn(pmde)); if (!folio) goto huge_out; if (pmd_young(pmde) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = HPAGE_PMD_SIZE; folio_put(folio); huge_out: spin_unlock(ptl); return 0; } regular_page: #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } ptent = ptep_get(pte); if (!pte_present(ptent)) goto out; folio = damon_get_folio(pte_pfn(ptent)); if (!folio) goto out; if (pte_young(ptent) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = folio_size(folio); folio_put(folio); out: pte_unmap_unlock(pte, ptl); return 0; } #ifdef CONFIG_HUGETLB_PAGE static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct damon_young_walk_private *priv = walk->private; struct hstate *h = hstate_vma(walk->vma); struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(h, walk->mm, pte); entry = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(entry)) goto out; folio = pfn_folio(pte_pfn(entry)); folio_get(folio); if (pte_young(entry) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = huge_page_size(h); folio_put(folio); out: spin_unlock(ptl); return 0; } #else #define damon_young_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, .hugetlb_entry = damon_young_hugetlb_entry, .walk_lock = PGWALK_RDLOCK, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, unsigned long *folio_sz) { struct damon_young_walk_private arg = { .folio_sz = folio_sz, .young = false, }; mmap_read_lock(mm); walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); mmap_read_unlock(mm); return arg.young; } /* * Check whether the region was accessed after the last preparation * * mm 'mm_struct' for the given virtual address space * r the region to be checked */ static void __damon_va_check_access(struct mm_struct *mm, struct damon_region *r, bool same_target, struct damon_attrs *attrs) { static unsigned long last_addr; static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; if (!mm) { damon_update_region_access_rate(r, false, attrs); return; } /* If the region is in the last checked page, reuse the result */ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { damon_update_region_access_rate(r, last_accessed, attrs); return; } last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz); damon_update_region_access_rate(r, last_accessed, attrs); last_addr = r->sampling_addr; } static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; struct damon_region *r; unsigned int max_nr_accesses = 0; bool same_target; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); same_target = false; damon_for_each_region(r, t) { __damon_va_check_access(mm, r, same_target, &ctx->attrs); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); same_target = true; } if (mm) mmput(mm); } return max_nr_accesses; } /* * Functions for the target validity check and cleanup */ static bool damon_va_target_valid(struct damon_target *t) { struct task_struct *task; task = damon_get_task_struct(t); if (task) { put_task_struct(task); return true; } return false; } #ifndef CONFIG_ADVISE_SYSCALLS static unsigned long damos_madvise(struct damon_target *target, struct damon_region *r, int behavior) { return 0; } #else static unsigned long damos_madvise(struct damon_target *target, struct damon_region *r, int behavior) { struct mm_struct *mm; unsigned long start = PAGE_ALIGN(r->ar.start); unsigned long len = PAGE_ALIGN(damon_sz_region(r)); unsigned long applied; mm = damon_get_mm(target); if (!mm) return 0; applied = do_madvise(mm, start, len, behavior) ? 0 : len; mmput(mm); return applied; } #endif /* CONFIG_ADVISE_SYSCALLS */ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) { int madv_action; switch (scheme->action) { case DAMOS_WILLNEED: madv_action = MADV_WILLNEED; break; case DAMOS_COLD: madv_action = MADV_COLD; break; case DAMOS_PAGEOUT: madv_action = MADV_PAGEOUT; break; case DAMOS_HUGEPAGE: madv_action = MADV_HUGEPAGE; break; case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; case DAMOS_STAT: return 0; default: /* * DAMOS actions that are not yet supported by 'vaddr'. */ return 0; } return damos_madvise(t, r, madv_action); } static int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: return damon_cold_score(context, r, scheme); default: break; } return DAMOS_MAX_SCORE; } static int __init damon_va_initcall(void) { struct damon_operations ops = { .id = DAMON_OPS_VADDR, .init = damon_va_init, .update = damon_va_update, .prepare_access_checks = damon_va_prepare_access_checks, .check_accesses = damon_va_check_accesses, .reset_aggregated = NULL, .target_valid = damon_va_target_valid, .cleanup = NULL, .apply_scheme = damon_va_apply_scheme, .get_scheme_score = damon_va_scheme_score, }; /* ops for fixed virtual address ranges */ struct damon_operations ops_fvaddr = ops; int err; /* Don't set the monitoring target regions for the entire mapping */ ops_fvaddr.id = DAMON_OPS_FVADDR; ops_fvaddr.init = NULL; ops_fvaddr.update = NULL; err = damon_register_ops(&ops); if (err) return err; return damon_register_ops(&ops_fvaddr); }; subsys_initcall(damon_va_initcall); #include "tests/vaddr-kunit.h"