From 6afb5157b9eba4092e2f0f54d24a3806409bdde5 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Wed, 19 May 2010 17:42:14 +0800 Subject: x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions No behavior change. Move some of vmalloc_sync_all() code into a new function sync_global_pgds() that will be useful for memory hotplug. Signed-off-by: Haicheng Li LKML-Reference: <4C6E4ECD.1090607@linux.intel.com> Reviewed-by: Wu Fengguang Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a20..61a1b4fdecbf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -97,6 +97,36 @@ static int __init nonx32_setup(char *str) } __setup("noexec32=", nonx32_setup); +/* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. -- cgit v1.2.3 From 9b861528a8012e7bc4d1f7bae07395b225331477 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Fri, 20 Aug 2010 17:50:16 +0800 Subject: x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes When memory hotplug-adding happens for a large enough area that a new PGD entry is needed for the direct mapping, the PGDs of other processes would not get updated. This leads to some CPUs oopsing like below when they have to access the unmapped areas. [ 1139.243192] BUG: soft lockup - CPU#0 stuck for 61s! [bash:6534] [ 1139.243195] Modules linked in: ipv6 autofs4 rfcomm l2cap crc16 bluetooth rfkill binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath dm_mod video output sbs sbshc fan battery ac parport_pc lp parport joydev usbhid processor thermal thermal_sys container button rtc_cmos rtc_core rtc_lib i2c_i801 i2c_core pcspkr uhci_hcd ohci_hcd ehci_hcd usbcore [ 1139.243229] irq event stamp: 8538759 [ 1139.243230] hardirqs last enabled at (8538759): [] restore_args+0x0/0x30 [ 1139.243236] hardirqs last disabled at (8538757): [] __do_softirq+0x106/0x146 [ 1139.243240] softirqs last enabled at (8538758): [] __do_softirq+0x137/0x146 [ 1139.243245] softirqs last disabled at (8538743): [] call_softirq+0x1c/0x34 [ 1139.243249] CPU 0: [ 1139.243250] Modules linked in: ipv6 autofs4 rfcomm l2cap crc16 bluetooth rfkill binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath dm_mod video output sbs sbshc fan battery ac parport_pc lp parport joydev usbhid processor thermal thermal_sys container button rtc_cmos rtc_core rtc_lib i2c_i801 i2c_core pcspkr uhci_hcd ohci_hcd ehci_hcd usbcore [ 1139.243284] Pid: 6534, comm: bash Tainted: G M 2.6.32-haicheng-cpuhp #7 QSSC-S4R [ 1139.243287] RIP: 0010:[] [] alloc_arraycache+0x35/0x69 [ 1139.243292] RSP: 0018:ffff8802799f9d78 EFLAGS: 00010286 [ 1139.243295] RAX: ffff8884ffc00000 RBX: ffff8802799f9d98 RCX: 0000000000000000 [ 1139.243297] RDX: 0000000000190018 RSI: 0000000000000001 RDI: ffff8884ffc00010 [ 1139.243300] RBP: ffffffff8100c34e R08: 0000000000000002 R09: 0000000000000000 [ 1139.243303] R10: ffffffff8246dda0 R11: 000000d08246dda0 R12: ffff8802599bfff0 [ 1139.243305] R13: ffff88027904c040 R14: ffff8802799f8000 R15: 0000000000000001 [ 1139.243308] FS: 00007fe81bfe86e0(0000) GS:ffff88000d800000(0000) knlGS:0000000000000000 [ 1139.243311] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1139.243313] CR2: ffff8884ffc00000 CR3: 000000026cf2d000 CR4: 00000000000006f0 [ 1139.243316] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1139.243318] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1139.243321] Call Trace: [ 1139.243324] [] ? alloc_arraycache+0x29/0x69 [ 1139.243328] [] ? cpuup_callback+0x1b0/0x32a [ 1139.243333] [] ? notifier_call_chain+0x33/0x5b [ 1139.243337] [] ? __raw_notifier_call_chain+0x9/0xb [ 1139.243340] [] ? cpu_up+0xb3/0x152 [ 1139.243344] [] ? store_online+0x4d/0x75 [ 1139.243348] [] ? sysdev_store+0x1b/0x1d [ 1139.243351] [] ? sysfs_write_file+0xe5/0x121 [ 1139.243355] [] ? vfs_write+0xae/0x14a [ 1139.243358] [] ? sys_write+0x47/0x6f [ 1139.243362] [] ? system_call_fastpath+0x16/0x1b This patch makes sure to always replicate new direct mapping PGD entries to the PGDs of all processes, as well as ensures corresponding vmemmap mapping gets synced. V1: initial code by Andi Kleen. V2: fix several issues found in testing. V3: as suggested by Wu Fengguang, reuse common code of vmalloc_sync_all(). [ hpa: changed pgd_change from int to bool ] Originally-by: Andi Kleen Signed-off-by: Haicheng Li LKML-Reference: <4C6E4FD8.6080100@linux.intel.com> Reviewed-by: Wu Fengguang Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 61a1b4fdecbf..64e7bc2ded93 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -564,8 +564,9 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - + bool pgd_changed = false; unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -593,7 +594,12 @@ kernel_physical_mapping_init(unsigned long start, spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; } + + if (pgd_changed) + sync_global_pgds(addr, end); + __flush_tlb_all(); return last_map_addr; @@ -1033,6 +1039,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) } } + sync_global_pgds((unsigned long)start_page, end); return 0; } -- cgit v1.2.3 From 1c5f50ee347daea013671f718b70cd6bf497bef9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 3 Sep 2010 17:04:07 +0800 Subject: x86, mm: fix uninitialized addr in kernel_physical_mapping_init() This re-adds the lost chunk in commit 9b861528a80. Reported-by: Stephen Rothwell Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Haicheng Li Cc: Andi Kleen LKML-Reference: <20100903090407.GA19771@localhost> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 64e7bc2ded93..74f0f358b07b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -570,6 +570,7 @@ kernel_physical_mapping_init(unsigned long start, start = (unsigned long)__va(start); end = (unsigned long)__va(end); + addr = start; for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); -- cgit v1.2.3 From 44235dcde416104b8e1db7606c283f4c0149c760 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 14 Oct 2010 17:04:59 -0700 Subject: x86, mm: Fix bogus whitespace in sync_global_pgds() Whitespace cleanup only. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 74f0f358b07b..1ad7c0ff5d2b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -103,28 +103,28 @@ __setup("noexec32=", nonx32_setup); */ void sync_global_pgds(unsigned long start, unsigned long end) { - unsigned long address; - - for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) - != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } } /* -- cgit v1.2.3 From 617d34d9e5d8326ec8f188c616aa06ac59d083fe Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 21 Sep 2010 12:01:51 -0700 Subject: x86, mm: Hold mm->page_table_lock while doing vmalloc_sync Take mm->page_table_lock while syncing the vmalloc region. This prevents a race with the Xen pagetable pin/unpin code, which expects that the page_table_lock is already held. If this race occurs, then Xen can see an inconsistent page type (a page can either be read/write or a pagetable page, and pin/unpin converts it between them), which will cause either the pin or the set_p[gm]d to fail; either will crash the kernel. vmalloc_sync_all() should be called rarely, so this extra use of page_table_lock should not interfere with its normal users. The mm pointer is stashed in the pgd page's index field, as that won't be otherwise used for pgds. Reported-by: Ian Campbell Originally-by: Jan Beulich LKML-Reference: <4CB88A4C.1080305@goop.org> Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/mm/fault.c | 11 ++++++++++- arch/x86/mm/init_64.c | 7 +++++++ arch/x86/mm/pgtable.c | 20 +++++++++++++++++--- 4 files changed, 36 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2d0a33bd2971..ada823a13c7c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; extern spinlock_t pgd_lock; extern struct list_head pgd_list; +extern struct mm_struct *pgd_page_get_mm(struct page *page); + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index caec22906d7c..6c27c39f8a37 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -229,7 +229,16 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) + spinlock_t *pgt_lock; + int ret; + + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + + if (!ret) break; } spin_unlock_irqrestore(&pgd_lock, flags); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ad7c0ff5d2b..4d323fb770c2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -116,12 +116,19 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + spinlock_t *pgt_lock; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); } spin_unlock_irqrestore(&pgd_lock, flags); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..c70e57dbb491 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(pgd_t *pgd) + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd) } /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); pgd_list_add(pgd); + } } static void pgd_dtor(pgd_t *pgd) @@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) */ spin_lock_irqsave(&pgd_lock, flags); - pgd_ctor(pgd); + pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); spin_unlock_irqrestore(&pgd_lock, flags); -- cgit v1.2.3