From 672ca28e300c17bf8d792a2a7a8631193e580c74 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 23 Jun 2008 11:21:37 -0700 Subject: Fix ZERO_PAGE breakage with vmware Commit 89f5b7da2a6bad2e84670422ab8192382a5aeb9f ("Reinstate ZERO_PAGE optimization in 'get_user_pages()' and fix XIP") broke vmware, as reported by Jeff Chua: "This broke vmware 6.0.4. Jun 22 14:53:03.845: vmx| NOT_IMPLEMENTED /build/mts/release/bora-93057/bora/vmx/main/vmmonPosix.c:774" and the reason seems to be that there's an old bug in how we handle do FOLL_ANON on VM_SHARED areas in get_user_pages(), but since it only triggered if the whole page table was missing, nobody had apparently hit it before. The recent changes to 'follow_page()' made the FOLL_ANON logic trigger not just for whole missing page tables, but for individual pages as well, and exposed this problem. This fixes it by making the test for when FOLL_ANON is used more careful, and also makes the code easier to read and understand by moving the logic to a separate inline function. Reported-and-tested-by: Jeff Chua Signed-off-by: Linus Torvalds --- mm/memory.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9aefaae46858..423e0e7c2f73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1045,6 +1045,26 @@ no_page_table: return page; } +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault or a nopfn routine, it's not an + * anonymous region. + */ + return !vma->vm_ops || + (!vma->vm_ops->fault && !vma->vm_ops->nopfn); +} + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -1119,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags = FOLL_TOUCH; if (pages) foll_flags |= FOLL_GET; - if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->fault)) + if (!write && use_zero_page(vma)) foll_flags |= FOLL_ANON; do { -- cgit v1.2.3 From 945754a1754f9d4c2974a8241ad4f92fad7f3a6a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 23 Jun 2008 14:30:30 +0200 Subject: mm: fix race in COW logic There is a race in the COW logic. It contains a shortcut to avoid the COW and reuse the page if we have the sole reference on the page, however it is possible to have two racing do_wp_page()ers with one causing the other to mistakenly believe it is safe to take the shortcut when it is not. This could lead to data corruption. Process 1 and process2 each have a wp pte of the same anon page (ie. one forked the other). The page's mapcount is 2. Then they both attempt to write to it around the same time... proc1 proc2 thr1 proc2 thr2 CPU0 CPU1 CPU3 do_wp_page() do_wp_page() trylock_page() can_share_swap_page() load page mapcount (==2) reuse = 0 pte unlock copy page to new_page pte lock page_remove_rmap(page); trylock_page() can_share_swap_page() load page mapcount (==1) reuse = 1 ptep_set_access_flags (allow W) write private key into page read from page ptep_clear_flush() set_pte_at(pte of new_page) Fix this by moving the page_remove_rmap of the old page after the pte clear and flush. Potentially the entire branch could be moved down here, but in order to stay consistent, I won't (should probably move all the *_mm_counter stuff with one patch). Signed-off-by: Nick Piggin Acked-by: Hugh Dickins Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 423e0e7c2f73..d14b251a25a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1785,7 +1785,6 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1807,6 +1806,32 @@ gotten: lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page, vma); + } + /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; -- cgit v1.2.3