From 42b43341b045227c5ffca04c2d9424c77abed4fb Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 12 Nov 2013 15:06:46 -0800 Subject: sh64: kernel: use 'usp' instead of 'fn' 'fn' is not defined, according to the implementation copy_thread() in 'sh32', need use 'usp' instead of. Signed-off-by: Chen Gang Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/process_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 174d124b419e..eac5947ac063 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -393,7 +393,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); childregs->regs[2] = (unsigned long)arg; - childregs->regs[3] = (unsigned long)fn; + childregs->regs[3] = (unsigned long)usp; childregs->sr = (1 << 30); /* not user_mode */ childregs->sr |= SR_FD; /* Invalidate FPU flag */ p->thread.pc = (unsigned long) ret_from_kernel_thread; -- cgit v1.2.3 From 8d28df813e5077913a60c61aaef2e6bf940d8cd9 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 12 Nov 2013 15:06:48 -0800 Subject: sh64: kernel: remove useless variable 'regs' Remove useless variable 'regs' to avoid the related warnings (warning is treated as error). Signed-off-by: Chen Gang Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/process_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index eac5947ac063..e2062e643341 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -374,7 +374,7 @@ asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, struct task_struct *p) { - struct pt_regs *childregs, *regs = current_pt_regs(); + struct pt_regs *childregs; #ifdef CONFIG_SH_FPU /* can't happen for a kernel thread */ -- cgit v1.2.3 From 72a0c5571351f5184195754d23db3e14495b2080 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 12 Nov 2013 15:06:49 -0800 Subject: cris: media platform drivers: fix build On cris arch, the functions below aren't defined: drivers/media/platform/sh_veu.c: In function 'sh_veu_reg_read': drivers/media/platform/sh_veu.c:228:2: error: implicit declaration of function 'ioread32' [-Werror=implicit-function-declaration] drivers/media/platform/sh_veu.c: In function 'sh_veu_reg_write': drivers/media/platform/sh_veu.c:234:2: error: implicit declaration of function 'iowrite32' [-Werror=implicit-function-declaration] drivers/media/platform/vsp1/vsp1.h: In function 'vsp1_read': drivers/media/platform/vsp1/vsp1.h:66:2: error: implicit declaration of function 'ioread32' [-Werror=implicit-function-declaration] drivers/media/platform/vsp1/vsp1.h: In function 'vsp1_write': drivers/media/platform/vsp1/vsp1.h:71:2: error: implicit declaration of function 'iowrite32' [-Werror=implicit-function-declaration] drivers/media/platform/vsp1/vsp1.h: In function 'vsp1_read': drivers/media/platform/vsp1/vsp1.h:66:2: error: implicit declaration of function 'ioread32' [-Werror=implicit-function-declaration] drivers/media/platform/vsp1/vsp1.h: In function 'vsp1_write': drivers/media/platform/vsp1/vsp1.h:71:2: error: implicit declaration of function 'iowrite32' [-Werror=implicit-function-declaration] drivers/media/platform/soc_camera/rcar_vin.c: In function 'rcar_vin_setup': drivers/media/platform/soc_camera/rcar_vin.c:284:3: error: implicit declaration of function 'iowrite32' [-Werror=implicit-function-declaration] drivers/media/platform/soc_camera/rcar_vin.c: In function 'rcar_vin_request_capture_stop': drivers/media/platform/soc_camera/rcar_vin.c:353:2: error: implicit declaration of function 'ioread32' [-Werror=implicit-function-declaration] Yet, they're available, as CONFIG_GENERIC_IOMAP is defined. What happens is that asm/io.h was not including asm-generic/iomap.h. Suggested-by: Ben Hutchings Signed-off-by: Mauro Carvalho Chehab Cc: Mikael Starvik Cc: Jesper Nilsson Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/include/asm/io.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/cris/include/asm/io.h b/arch/cris/include/asm/io.h index 5d3047e5563b..4353cf239a13 100644 --- a/arch/cris/include/asm/io.h +++ b/arch/cris/include/asm/io.h @@ -3,6 +3,7 @@ #include /* for __va, __pa */ #include +#include #include struct cris_io_operations -- cgit v1.2.3 From c1ce4b375fa7b76d8b553d4f8d6cc5a09f063691 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 12 Nov 2013 15:07:13 -0800 Subject: mm/arch: use __free_reserved_page() to simplify the code Use __free_reserved_page() to simplify the code in arch. It used split_page() in consistent_alloc()/__dma_alloc_coherent()/dma_alloc_coherent(), so page->_count == 1, and we can free it safely. __free_reserved_page() ClearPageReserved() init_page_count() // it won't change the value __free_page() Signed-off-by: Xishi Qiu Cc: James Hogan Cc: Michal Simek Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/metag/kernel/dma.c | 4 +--- arch/microblaze/mm/consistent.c | 7 ++----- arch/powerpc/mm/dma-noncoherent.c | 4 +--- 3 files changed, 4 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/metag/kernel/dma.c b/arch/metag/kernel/dma.c index 8c00dedadc54..db589ad5dbc4 100644 --- a/arch/metag/kernel/dma.c +++ b/arch/metag/kernel/dma.c @@ -305,9 +305,7 @@ void dma_free_coherent(struct device *dev, size_t size, if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - ClearPageReserved(page); - - __free_page(page); + __free_reserved_page(page); continue; } } diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index 5226b09cbbb2..dbbf2246a260 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -176,8 +176,7 @@ void consistent_free(size_t size, void *vaddr) page = virt_to_page(vaddr); do { - ClearPageReserved(page); - __free_page(page); + __free_reserved_page(page); page++; } while (size -= PAGE_SIZE); #else @@ -194,9 +193,7 @@ void consistent_free(size_t size, void *vaddr) pte_clear(&init_mm, (unsigned int)vaddr, ptep); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - - ClearPageReserved(page); - __free_page(page); + __free_reserved_page(page); } } vaddr += PAGE_SIZE; diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 6747eece84af..7b6c10750179 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -287,9 +287,7 @@ void __dma_free_coherent(size_t size, void *vaddr) pte_clear(&init_mm, addr, ptep); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - - ClearPageReserved(page); - __free_page(page); + __free_reserved_page(page); } } addr += PAGE_SIZE; -- cgit v1.2.3 From c69ded84a968e8ecc529b4de68522e4a2dcbf92a Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 12 Nov 2013 15:07:15 -0800 Subject: mm: remove obsolete comments about page table lock The callers of free_pgd_range() and hugetlb_free_pgd_range() don't hold page table locks. The comments seems to be obsolete, so let's remove them. Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 2 -- mm/memory.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d67db4bd672d..90bb6d9409bf 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -633,8 +633,6 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, /* * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held. */ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, diff --git a/mm/memory.c b/mm/memory.c index 1f2287eaa88e..15744b2cf919 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, /* * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, -- cgit v1.2.3 From 6408068ee6ce9d80d66908a2c01a24ee88afd47d Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 12 Nov 2013 15:07:17 -0800 Subject: mm: use pgdat_end_pfn() to simplify the code in arch Use "pgdat_end_pfn()" instead of "pgdat->node_start_pfn + pgdat->node_spanned_pages". Simplify the code, no functional change. Signed-off-by: Xishi Qiu Cc: James Hogan Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 +--- arch/metag/mm/init.c | 2 +- arch/powerpc/mm/numa.c | 3 +-- arch/sh/mm/init.c | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b6f7f43424ec..88504abf5704 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -357,9 +357,7 @@ int vmemmap_find_next_valid_pfn(int node, int i) end_address = (unsigned long) &vmem_map[pgdat->node_start_pfn + i]; end_address = PAGE_ALIGN(end_address); - - stop_address = (unsigned long) &vmem_map[ - pgdat->node_start_pfn + pgdat->node_spanned_pages]; + stop_address = (unsigned long) &vmem_map[pgdat_end_pfn(pgdat)]; do { pgd_t *pgd; diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index 249fff66add3..3cd6288f65c2 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -148,7 +148,7 @@ static void __init bootmem_init_one_node(unsigned int nid) if (!p->node_spanned_pages) return; - end_pfn = p->node_start_pfn + p->node_spanned_pages; + end_pfn = pgdat_end_pfn(p); #ifdef CONFIG_HIGHMEM if (end_pfn > max_low_pfn) end_pfn = max_low_pfn; diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 33d67844062c..078d3e00a616 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -938,8 +938,7 @@ static void __init mark_reserved_regions_for_nid(int nid) unsigned long start_pfn = physbase >> PAGE_SHIFT; unsigned long end_pfn = PFN_UP(physbase + size); struct node_active_region node_ar; - unsigned long node_end_pfn = node->node_start_pfn + - node->node_spanned_pages; + unsigned long node_end_pfn = pgdat_end_pfn(node); /* * Check to make sure that this memblock.reserved area is diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 33890fd267cb..2d089fe2cba9 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -231,7 +231,7 @@ static void __init bootmem_init_one_node(unsigned int nid) if (!p->node_spanned_pages) return; - end_pfn = p->node_start_pfn + p->node_spanned_pages; + end_pfn = pgdat_end_pfn(p); total_pages = bootmem_bootmap_pages(p->node_spanned_pages); -- cgit v1.2.3 From 40c3baa7c66f1352521378ee83509fb8f4c465de Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Tue, 12 Nov 2013 15:07:27 -0800 Subject: mm/arch: use NUMA_NO_NODE Use more appropriate NUMA_NO_NODE instead of -1 in all archs' module_alloc() Signed-off-by: Jianguo Wu Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/module.c | 2 +- arch/arm64/kernel/module.c | 2 +- arch/parisc/kernel/module.c | 2 +- arch/s390/kernel/module.c | 2 +- arch/sparc/kernel/module.c | 2 +- arch/x86/kernel/module.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 084dc8896986..c9dfff3b8008 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -40,7 +40,7 @@ void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, -1, + GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 2c28a6cf93e6..e2ad0d87721f 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -29,7 +29,7 @@ void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, -1, + GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 2a625fb063e1..50dfafc3f2c1 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -219,7 +219,7 @@ void *module_alloc(unsigned long size) * init_data correctly */ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_RWX, -1, + PAGE_KERNEL_RWX, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 7845e15a17df..b89b59158b95 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -50,7 +50,7 @@ void *module_alloc(unsigned long size) if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, -1, + GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index 4435488ebe25..97655e0fd243 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -29,7 +29,7 @@ static void *module_map(unsigned long size) if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, -1, + GFP_KERNEL, PAGE_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } #else diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 216a4d754b0c..18be189368bb 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -49,7 +49,7 @@ void *module_alloc(unsigned long size) return NULL; return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, - -1, __builtin_return_address(0)); + NUMA_NO_NODE, __builtin_return_address(0)); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 7aba842f08bb9e8485ff40067dc090ada4c7f575 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Nov 2013 15:07:55 -0800 Subject: s390/mmap: randomize mmap base for bottom up direction Implement mmap base randomization for the bottom up direction, so ASLR works for both mmap layouts on s390. See also commit df54d6fa5427 ("x86 get_unmapped_area(): use proper mmap base for bottom-up direction"). Signed-off-by: Heiko Carstens Cc: Radu Caragea Cc: Michel Lespinasse Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Metcalf Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/mmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 6bcb045d2bd2..9b436c21195e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -64,6 +64,11 @@ static unsigned long mmap_rnd(void) return (get_random_int() & 0x7ffUL) << PAGE_SHIFT; } +static unsigned long mmap_base_legacy(void) +{ + return TASK_UNMAPPED_BASE + mmap_rnd(); +} + static inline unsigned long mmap_base(void) { unsigned long gap = rlimit(RLIMIT_STACK); @@ -89,7 +94,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * bit is set, or if the expected stack growth is unlimited: */ if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; + mm->mmap_base = mmap_base_legacy(); mm->get_unmapped_area = arch_get_unmapped_area; } else { mm->mmap_base = mmap_base(); @@ -164,7 +169,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * bit is set, or if the expected stack growth is unlimited: */ if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; + mm->mmap_base = mmap_base_legacy(); mm->get_unmapped_area = s390_get_unmapped_area; } else { mm->mmap_base = mmap_base(); -- cgit v1.2.3 From 0167d7d8b0beb4cf12076b47e4dc73897ae5acb0 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 12 Nov 2013 15:08:02 -0800 Subject: x86/mm: factor out of top-down direct mapping setup Create a new function memory_map_top_down to factor out of the top-down direct memory mapping pagetable setup. This is also a preparation for the following patch, which will introduce the bottom-up memory mapping. That said, we will put the two ways of pagetable setup into separate functions, and choose to use which way in init_mem_mapping, which makes the code more clear. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Acked-by: Tejun Heo Acked-by: Toshi Kani Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Wanpeng Li Cc: Thomas Renninger Cc: Yinghai Lu Cc: Jiang Liu Cc: Wen Congyang Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init.c | 59 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ce32017c5e38..742d6d4ad9eb 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -418,27 +418,27 @@ static unsigned long __init get_new_step_size(unsigned long step_size) return step_size << 5; } -void __init init_mem_mapping(void) +/** + * memory_map_top_down - Map [map_start, map_end) top down + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in top-down. That said, the page tables + * will be allocated at the end of the memory, and we map the + * memory in top-down. + */ +static void __init memory_map_top_down(unsigned long map_start, + unsigned long map_end) { - unsigned long end, real_end, start, last_start; + unsigned long real_end, start, last_start; unsigned long step_size; unsigned long addr; unsigned long mapped_ram_size = 0; unsigned long new_mapped_ram_size; - probe_page_size_mask(); - -#ifdef CONFIG_X86_64 - end = max_pfn << PAGE_SHIFT; -#else - end = max_low_pfn << PAGE_SHIFT; -#endif - - /* the ISA range is always mapped regardless of memory holes */ - init_memory_mapping(0, ISA_END_ADDRESS); - /* xen has big range in reserved near end of ram, skip it at first.*/ - addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE); + addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ @@ -453,13 +453,13 @@ void __init init_mem_mapping(void) * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages * for page table. */ - while (last_start > ISA_END_ADDRESS) { + while (last_start > map_start) { if (last_start > step_size) { start = round_down(last_start - 1, step_size); - if (start < ISA_END_ADDRESS) - start = ISA_END_ADDRESS; + if (start < map_start) + start = map_start; } else - start = ISA_END_ADDRESS; + start = map_start; new_mapped_ram_size = init_range_memory_mapping(start, last_start); last_start = start; @@ -470,8 +470,27 @@ void __init init_mem_mapping(void) mapped_ram_size += new_mapped_ram_size; } - if (real_end < end) - init_range_memory_mapping(real_end, end); + if (real_end < map_end) + init_range_memory_mapping(real_end, map_end); +} + +void __init init_mem_mapping(void) +{ + unsigned long end; + + probe_page_size_mask(); + +#ifdef CONFIG_X86_64 + end = max_pfn << PAGE_SHIFT; +#else + end = max_low_pfn << PAGE_SHIFT; +#endif + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + /* setup direct mapping for range [ISA_END_ADDRESS, end) in top-down*/ + memory_map_top_down(ISA_END_ADDRESS, end); #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { -- cgit v1.2.3 From b959ed6c73845aebf51afb8f76bb74b9388344d2 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 12 Nov 2013 15:08:05 -0800 Subject: x86/mem-hotplug: support initialize page tables in bottom-up The Linux kernel cannot migrate pages used by the kernel. As a result, kernel pages cannot be hot-removed. So we cannot allocate hotpluggable memory for the kernel. In a memory hotplug system, any numa node the kernel resides in should be unhotpluggable. And for a modern server, each node could have at least 16GB memory. So memory around the kernel image is highly likely unhotpluggable. ACPI SRAT (System Resource Affinity Table) contains the memory hotplug info. But before SRAT is parsed, memblock has already started to allocate memory for the kernel. So we need to prevent memblock from doing this. So direct memory mapping page tables setup is the case. init_mem_mapping() is called before SRAT is parsed. To prevent page tables being allocated within hotpluggable memory, we will use bottom-up direction to allocate page tables from the end of kernel image to the higher memory. Note: As for allocating page tables in lower memory, TJ said: : This is an optional behavior which is triggered by a very specific kernel : boot param, which I suspect is gonna need to stick around to support : memory hotplug in the current setup unless we add another layer of address : translation to support memory hotplug. As for page tables may occupy too much lower memory if using 4K mapping (CONFIG_DEBUG_PAGEALLOC and CONFIG_KMEMCHECK both disable using >4k pages), TJ said: : But as I said in the same paragraph, parsing SRAT earlier doesn't solve : the problem in itself either. Ignoring the option if 4k mapping is : required and memory consumption would be prohibitive should work, no? : Something like that would be necessary if we're gonna worry about cases : like this no matter how we implement it, but, frankly, I'm not sure this : is something worth worrying about. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Acked-by: Tejun Heo Acked-by: Toshi Kani Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Wanpeng Li Cc: Thomas Renninger Cc: Yinghai Lu Cc: Jiang Liu Cc: Wen Congyang Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 742d6d4ad9eb..91b522072a4d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -474,6 +474,51 @@ static void __init memory_map_top_down(unsigned long map_start, init_range_memory_mapping(real_end, map_end); } +/** + * memory_map_bottom_up - Map [map_start, map_end) bottom up + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in bottom-up. Since we have limited the + * bottom-up allocation above the kernel, the page tables will + * be allocated just above the kernel and we map the memory + * in [map_start, map_end) in bottom-up. + */ +static void __init memory_map_bottom_up(unsigned long map_start, + unsigned long map_end) +{ + unsigned long next, new_mapped_ram_size, start; + unsigned long mapped_ram_size = 0; + /* step_size need to be small so pgt_buf from BRK could cover it */ + unsigned long step_size = PMD_SIZE; + + start = map_start; + min_pfn_mapped = start >> PAGE_SHIFT; + + /* + * We start from the bottom (@map_start) and go to the top (@map_end). + * The memblock_find_in_range() gets us a block of RAM from the + * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages + * for page table. + */ + while (start < map_end) { + if (map_end - start > step_size) { + next = round_up(start + 1, step_size); + if (next > map_end) + next = map_end; + } else + next = map_end; + + new_mapped_ram_size = init_range_memory_mapping(start, next); + start = next; + + if (new_mapped_ram_size > mapped_ram_size) + step_size = get_new_step_size(step_size); + mapped_ram_size += new_mapped_ram_size; + } +} + void __init init_mem_mapping(void) { unsigned long end; @@ -489,8 +534,25 @@ void __init init_mem_mapping(void) /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); - /* setup direct mapping for range [ISA_END_ADDRESS, end) in top-down*/ - memory_map_top_down(ISA_END_ADDRESS, end); + /* + * If the allocation is in bottom-up direction, we setup direct mapping + * in bottom-up, otherwise we setup direct mapping in top-down. + */ + if (memblock_bottom_up()) { + unsigned long kernel_end = __pa_symbol(_end); + + /* + * we need two separate calls here. This is because we want to + * allocate page tables above the kernel. So we first map + * [kernel_end, end) to make memory above the kernel be mapped + * as soon as possible. And then use page tables allocated above + * the kernel to map [ISA_END_ADDRESS, kernel_end). + */ + memory_map_bottom_up(kernel_end, end); + memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); + } else { + memory_map_top_down(ISA_END_ADDRESS, end); + } #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { -- cgit v1.2.3 From fa591c4ae76ecbd4d26d7e8f65429d6d454554a6 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 12 Nov 2013 15:08:07 -0800 Subject: x86, acpi, crash, kdump: do reserve_crashkernel() after SRAT is parsed. Memory reserved for crashkernel could be large. So we should not allocate this memory bottom up from the end of kernel image. When SRAT is parsed, we will be able to know which memory is hotpluggable, and we can avoid allocating this memory for the kernel. So reorder reserve_crashkernel() after SRAT is parsed. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Acked-by: Tejun Heo Acked-by: Toshi Kani Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Wanpeng Li Cc: Thomas Renninger Cc: Yinghai Lu Cc: Jiang Liu Cc: Wen Congyang Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 918d489fa53d..cb233bc9dee3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1121,8 +1121,6 @@ void __init setup_arch(char **cmdline_p) acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start); #endif - reserve_crashkernel(); - vsmp_init(); io_delay_init(); @@ -1135,6 +1133,13 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); initmem_init(); + + /* + * Reserve memory for crash kernel after SRAT is parsed so that it + * won't consume hotpluggable memory. + */ + reserve_crashkernel(); + memblock_find_dma_reserve(); #ifdef CONFIG_KVM_GUEST -- cgit v1.2.3 From c5320926e370b4cfb8f10c2169e26f960079cf67 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 12 Nov 2013 15:08:10 -0800 Subject: mem-hotplug: introduce movable_node boot option The hot-Pluggable field in SRAT specifies which memory is hotpluggable. As we mentioned before, if hotpluggable memory is used by the kernel, it cannot be hot-removed. So memory hotplug users may want to set all hotpluggable memory in ZONE_MOVABLE so that the kernel won't use it. Memory hotplug users may also set a node as movable node, which has ZONE_MOVABLE only, so that the whole node can be hot-removed. But the kernel cannot use memory in ZONE_MOVABLE. By doing this, the kernel cannot use memory in movable nodes. This will cause NUMA performance down. And other users may be unhappy. So we need a way to allow users to enable and disable this functionality. In this patch, we introduce movable_node boot option to allow users to choose to not to consume hotpluggable memory at early boot time and later we can set it as ZONE_MOVABLE. To achieve this, the movable_node boot option will control the memblock allocation direction. That said, after memblock is ready, before SRAT is parsed, we should allocate memory near the kernel image as we explained in the previous patches. So if movable_node boot option is set, the kernel does the following: 1. After memblock is ready, make memblock allocate memory bottom up. 2. After SRAT is parsed, make memblock behave as default, allocate memory top down. Users can specify "movable_node" in kernel commandline to enable this functionality. For those who don't use memory hotplug or who don't want to lose their NUMA performance, just don't specify anything. The kernel will work as before. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Suggested-by: Kamezawa Hiroyuki Suggested-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Toshi Kani Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Wanpeng Li Cc: Thomas Renninger Cc: Yinghai Lu Cc: Jiang Liu Cc: Wen Congyang Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 3 +++ arch/x86/mm/numa.c | 11 +++++++++++ mm/Kconfig | 17 ++++++++++++----- mm/memory_hotplug.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fd3ecedc084d..882a40d405c8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1775,6 +1775,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that the amount of memory usable for all allocations is not too small. + movable_node [KNL,X86] Boot-time switch to enable the effects + of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. + MTD_Partition= [MTD] Format: ,,, diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8bf93bae1f13..24aec58d6afd 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -567,6 +567,17 @@ static int __init numa_init(int (*init_func)(void)) ret = init_func(); if (ret < 0) return ret; + + /* + * We reset memblock back to the top-down direction + * here because if we configured ACPI_NUMA, we have + * parsed SRAT in init_func(). It is ok to have the + * reset here even if we did't configure ACPI_NUMA + * or acpi numa init fails and fallbacks to dummy + * numa init. + */ + memblock_set_bottom_up(false); + ret = numa_cleanup_meminfo(&numa_meminfo); if (ret < 0) return ret; diff --git a/mm/Kconfig b/mm/Kconfig index 394838f489eb..3f4ffda152bb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,11 +153,18 @@ config MOVABLE_NODE help Allow a node to have only movable memory. Pages used by the kernel, such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows users to - online all the memory of a node as movable memory so that the whole - node can be hotplugged. Users who don't use the memory hotplug - feature are fine with this option on since they don't online memory - as movable. + memory device cannot be hotplugged. This option allows the following + two things: + - When the system is booting, node full of hotpluggable memory can + be arranged to have only movable memory so that the whole node can + be hot-removed. (need movable_node boot option specified). + - After the system is up, the option allows users to online all the + memory of a node as movable memory so that the whole node can be + hot-removed. + + Users who don't use the memory hotplug feature are fine with this + option on since they don't specify movable_node boot option or they + don't online memory as movable. Say Y here if you want to hotplug a whole node. Say N here if you want kernel to use memory on all nodes evenly. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1b6fe8ca71e6..489f235502db 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1422,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) } #endif /* CONFIG_MOVABLE_NODE */ +static int __init cmdline_parse_movable_node(char *p) +{ +#ifdef CONFIG_MOVABLE_NODE + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + memblock_set_bottom_up(true); +#else + pr_warn("movable_node option not supported\n"); +#endif + return 0; +} +early_param("movable_node", cmdline_parse_movable_node); + /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) -- cgit v1.2.3 From d4dd100f2ebb2bc9aca5378ad3cb333b6117069c Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Tue, 12 Nov 2013 15:08:28 -0800 Subject: arch/x86/mm/init.c: fix incorrect function name in alloc_low_pages() Signed-off-by: Zhi Yong Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 91b522072a4d..f97130618113 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -53,12 +53,12 @@ __ref void *alloc_low_pages(unsigned int num) if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { unsigned long ret; if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_page: ran out of memory"); + panic("alloc_low_pages: ran out of memory"); ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT, PAGE_SIZE * num , PAGE_SIZE); if (!ret) - panic("alloc_low_page: can not alloc memory"); + panic("alloc_low_pages: can not alloc memory"); memblock_reserve(ret, PAGE_SIZE * num); pfn = ret >> PAGE_SHIFT; } else { -- cgit v1.2.3 From 0ca43435188b9f911c8efcdf10731f726142dda1 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 12 Nov 2013 15:08:40 -0800 Subject: errno.h: remove "NFS" from descriptions in comments glibc recently changed the error string for ESTALE to remove "NFS" - https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=96945714ec61951cc748da2b4b8a80cf02127ee9 from: [ERR_REMAP (ESTALE)] = N_("Stale NFS file handle"), to: [ERR_REMAP (ESTALE)] = N_("Stale file handle"), And some have expressed concern that the kernel's errno.h comments still refer to NFS. So make that change... note that this is a comment-only change, and has no functional difference. Signed-off-by: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/errno.h | 2 +- arch/mips/include/uapi/asm/errno.h | 2 +- arch/parisc/include/uapi/asm/errno.h | 2 +- arch/sparc/include/uapi/asm/errno.h | 2 +- drivers/staging/lustre/lustre/include/lustre/lustre_errno.h | 2 +- include/uapi/asm-generic/errno.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h index e5f29ca28180..17f92aa76b2f 100644 --- a/arch/alpha/include/uapi/asm/errno.h +++ b/arch/alpha/include/uapi/asm/errno.h @@ -43,7 +43,7 @@ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ -#define ESTALE 70 /* Stale NFS file handle */ +#define ESTALE 70 /* Stale file handle */ #define EREMOTE 71 /* Object is remote */ #define ENOLCK 77 /* No record locks available */ diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h index 31575e2fd1bd..02d645d7aa9a 100644 --- a/arch/mips/include/uapi/asm/errno.h +++ b/arch/mips/include/uapi/asm/errno.h @@ -102,7 +102,7 @@ #define EWOULDBLOCK EAGAIN /* Operation would block */ #define EALREADY 149 /* Operation already in progress */ #define EINPROGRESS 150 /* Operation now in progress */ -#define ESTALE 151 /* Stale NFS file handle */ +#define ESTALE 151 /* Stale file handle */ #define ECANCELED 158 /* AIO operation canceled */ /* diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h index 135ad6047e51..f3a8aa554841 100644 --- a/arch/parisc/include/uapi/asm/errno.h +++ b/arch/parisc/include/uapi/asm/errno.h @@ -37,7 +37,7 @@ #define EBADMSG 67 /* Not a data message */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ -#define ESTALE 70 /* Stale NFS file handle */ +#define ESTALE 70 /* Stale file handle */ #define EREMOTE 71 /* Object is remote */ #define EOVERFLOW 72 /* Value too large for defined data type */ diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h index c351aba997b7..20423e172853 100644 --- a/arch/sparc/include/uapi/asm/errno.h +++ b/arch/sparc/include/uapi/asm/errno.h @@ -40,7 +40,7 @@ #define EPROCLIM 67 /* SUNOS: Too many processes */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ -#define ESTALE 70 /* Stale NFS file handle */ +#define ESTALE 70 /* Stale file handle */ #define EREMOTE 71 /* Object is remote */ #define ENOSTR 72 /* Device not a stream */ #define ETIME 73 /* Timer expired */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h b/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h index 2870487dd286..35aefa2cdad1 100644 --- a/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h @@ -165,7 +165,7 @@ #define LUSTRE_EHOSTUNREACH 113 /* No route to host */ #define LUSTRE_EALREADY 114 /* Operation already in progress */ #define LUSTRE_EINPROGRESS 115 /* Operation now in progress */ -#define LUSTRE_ESTALE 116 /* Stale NFS file handle */ +#define LUSTRE_ESTALE 116 /* Stale file handle */ #define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */ #define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */ #define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */ diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h index a1331ce50445..1e1ea6e6e7a5 100644 --- a/include/uapi/asm-generic/errno.h +++ b/include/uapi/asm-generic/errno.h @@ -86,7 +86,7 @@ #define EHOSTUNREACH 113 /* No route to host */ #define EALREADY 114 /* Operation already in progress */ #define EINPROGRESS 115 /* Operation now in progress */ -#define ESTALE 116 /* Stale NFS file handle */ +#define ESTALE 116 /* Stale file handle */ #define EUCLEAN 117 /* Structure needs cleaning */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ -- cgit v1.2.3 From 616c05d110bb4ef8203f49c9d2476874077c2f6a Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 12 Nov 2013 15:08:45 -0800 Subject: sh: move fpu_counter into ARCH specific thread_struct Only a couple of arches (sh/x86) use fpu_counter in task_struct so it can be moved out into ARCH specific thread_struct, reducing the size of task_struct for other arches. Compile tested sh defconfig + sh4-linux-gcc (4.6.3) Signed-off-by: Vineet Gupta Cc: Paul Mundt Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/fpu.h | 2 +- arch/sh/include/asm/processor_32.h | 10 ++++++++++ arch/sh/include/asm/processor_64.h | 10 ++++++++++ arch/sh/kernel/cpu/fpu.c | 2 +- arch/sh/kernel/process_32.c | 6 +++--- 5 files changed, 25 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/sh/include/asm/fpu.h b/arch/sh/include/asm/fpu.h index 06c4281aab65..09fc2bc8a790 100644 --- a/arch/sh/include/asm/fpu.h +++ b/arch/sh/include/asm/fpu.h @@ -46,7 +46,7 @@ static inline void __unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs) save_fpu(tsk); release_fpu(regs); } else - tsk->fpu_counter = 0; + tsk->thread.fpu_counter = 0; } static inline void unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs) diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index e699a12cdcca..18e0377f72bb 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -111,6 +111,16 @@ struct thread_struct { /* Extended processor state */ union thread_xstate *xstate; + + /* + * fpu_counter contains the number of consecutive context switches + * that the FPU is used. If this is over a threshold, the lazy fpu + * saving becomes unlazy to save the trap. This is an unsigned char + * so that after 256 times the counter wraps and the behavior turns + * lazy again; this to deal with bursty apps that only use FPU for + * a short time + */ + unsigned char fpu_counter; }; #define INIT_THREAD { \ diff --git a/arch/sh/include/asm/processor_64.h b/arch/sh/include/asm/processor_64.h index 1cc7d3197143..eedd4f625d07 100644 --- a/arch/sh/include/asm/processor_64.h +++ b/arch/sh/include/asm/processor_64.h @@ -126,6 +126,16 @@ struct thread_struct { /* floating point info */ union thread_xstate *xstate; + + /* + * fpu_counter contains the number of consecutive context switches + * that the FPU is used. If this is over a threshold, the lazy fpu + * saving becomes unlazy to save the trap. This is an unsigned char + * so that after 256 times the counter wraps and the behavior turns + * lazy again; this to deal with bursty apps that only use FPU for + * a short time + */ + unsigned char fpu_counter; }; #define INIT_MMAP \ diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c index f8f7af51c128..4e332244ea75 100644 --- a/arch/sh/kernel/cpu/fpu.c +++ b/arch/sh/kernel/cpu/fpu.c @@ -44,7 +44,7 @@ void __fpu_state_restore(void) restore_fpu(tsk); task_thread_info(tsk)->status |= TS_USEDFPU; - tsk->fpu_counter++; + tsk->thread.fpu_counter++; } void fpu_state_restore(struct pt_regs *regs) diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ebd3933005b4..2885fc9d9dcd 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, #endif ti->addr_limit = KERNEL_DS; ti->status &= ~TS_USEDFPU; - p->fpu_counter = 0; + p->thread.fpu_counter = 0; return 0; } *childregs = *current_pt_regs(); @@ -189,7 +189,7 @@ __switch_to(struct task_struct *prev, struct task_struct *next) unlazy_fpu(prev, task_pt_regs(prev)); /* we're going to use this soon, after a few expensive things */ - if (next->fpu_counter > 5) + if (next->thread.fpu_counter > 5) prefetch(next_t->xstate); #ifdef CONFIG_MMU @@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev, struct task_struct *next) * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now */ - if (next->fpu_counter > 5) + if (next->thread.fpu_counter > 5) __fpu_state_restore(); return prev; -- cgit v1.2.3 From c375f15a434db1867cb004bafba92aba739e4e39 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 12 Nov 2013 15:08:46 -0800 Subject: x86: move fpu_counter into ARCH specific thread_struct Only a couple of arches (sh/x86) use fpu_counter in task_struct so it can be moved out into ARCH specific thread_struct, reducing the size of task_struct for other arches. Compile tested i386_defconfig + gcc 4.7.3 Signed-off-by: Vineet Gupta Acked-by: Ingo Molnar Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/fpu-internal.h | 10 +++++----- arch/x86/include/asm/processor.h | 9 +++++++++ arch/x86/kernel/i387.c | 2 +- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/traps.c | 2 +- 6 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 4d0bda7b11e3..c49a613c6452 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -365,7 +365,7 @@ static inline void drop_fpu(struct task_struct *tsk) * Forget coprocessor state.. */ preempt_disable(); - tsk->fpu_counter = 0; + tsk->thread.fpu_counter = 0; __drop_fpu(tsk); clear_used_math(); preempt_enable(); @@ -424,7 +424,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * or if the past 5 consecutive context-switches used math. */ fpu.preload = tsk_used_math(new) && (use_eager_fpu() || - new->fpu_counter > 5); + new->thread.fpu_counter > 5); if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) cpu = ~0; @@ -433,16 +433,16 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new->fpu_counter++; + new->thread.fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); } else if (!use_eager_fpu()) stts(); } else { - old->fpu_counter = 0; + old->thread.fpu_counter = 0; old->thread.fpu.last_cpu = ~0; if (fpu.preload) { - new->fpu_counter++; + new->thread.fpu_counter++; if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 987c75ecc334..7b034a4057f9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -488,6 +488,15 @@ struct thread_struct { unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + /* + * fpu_counter contains the number of consecutive context switches + * that the FPU is used. If this is over a threshold, the lazy fpu + * saving becomes unlazy to save the trap. This is an unsigned char + * so that after 256 times the counter wraps and the behavior turns + * lazy again; this to deal with bursty apps that only use FPU for + * a short time + */ + unsigned char fpu_counter; }; /* diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 5d576ab34403..e8368c6dd2a2 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -100,7 +100,7 @@ void unlazy_fpu(struct task_struct *tsk) __save_init_fpu(tsk); __thread_fpu_end(tsk); } else - tsk->fpu_counter = 0; + tsk->thread.fpu_counter = 0; preempt_enable(); } EXPORT_SYMBOL(unlazy_fpu); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c2ec1aa6d454..6f1236c29c4b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -153,7 +153,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, childregs->orig_ax = -1; childregs->cs = __KERNEL_CS | get_kernel_rpl(); childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; - p->fpu_counter = 0; + p->thread.fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); return 0; @@ -166,7 +166,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.ip = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); - p->fpu_counter = 0; + p->thread.fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 45ab4d6fc8a7..10fe4c189621 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -163,7 +163,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) childregs; p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); - p->fpu_counter = 0; + p->thread.fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 729aa779ff75..996ce2313ce6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -653,7 +653,7 @@ void math_state_restore(void) return; } - tsk->fpu_counter++; + tsk->thread.fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); -- cgit v1.2.3 From 66f119695bcccc2b03b354152f6d65a0b129cef9 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 12 Nov 2013 15:09:54 -0800 Subject: arch/arm/mach-davinci/sram.c: use gen_pool_dma_alloc() to sram.c Since gen_pool_dma_alloc() is introduced, we implement it to simplify code. Signed-off-by: Nicolin Chen Cc: Sekhar Nori Cc: Kevin Hilman Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-davinci/sram.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-davinci/sram.c b/arch/arm/mach-davinci/sram.c index f18928b073f5..8540dddf1fbd 100644 --- a/arch/arm/mach-davinci/sram.c +++ b/arch/arm/mach-davinci/sram.c @@ -25,7 +25,6 @@ struct gen_pool *sram_get_gen_pool(void) void *sram_alloc(size_t len, dma_addr_t *dma) { - unsigned long vaddr; dma_addr_t dma_base = davinci_soc_info.sram_dma; if (dma) @@ -33,13 +32,7 @@ void *sram_alloc(size_t len, dma_addr_t *dma) if (!sram_pool || (dma && !dma_base)) return NULL; - vaddr = gen_pool_alloc(sram_pool, len); - if (!vaddr) - return NULL; - - if (dma) - *dma = gen_pool_virt_to_phys(sram_pool, vaddr); - return (void *)vaddr; + return gen_pool_dma_alloc(sram_pool, len, dma); } EXPORT_SYMBOL(sram_alloc); -- cgit v1.2.3 From d049f74f2dbe71354d43d393ac3a188947811348 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Nov 2013 15:11:17 -0800 Subject: exec/ptrace: fix get_dumpable() incorrect tests The get_dumpable() return value is not boolean. Most users of the function actually want to be testing for non-SUID_DUMP_USER(1) rather than SUID_DUMP_DISABLE(0). The SUID_DUMP_ROOT(2) is also considered a protected state. Almost all places did this correctly, excepting the two places fixed in this patch. Wrong logic: if (dumpable == SUID_DUMP_DISABLE) { /* be protective */ } or if (dumpable == 0) { /* be protective */ } or if (!dumpable) { /* be protective */ } Correct logic: if (dumpable != SUID_DUMP_USER) { /* be protective */ } or if (dumpable != 1) { /* be protective */ } Without this patch, if the system had set the sysctl fs/suid_dumpable=2, a user was able to ptrace attach to processes that had dropped privileges to that user. (This may have been partially mitigated if Yama was enabled.) The macros have been moved into the file that declares get/set_dumpable(), which means things like the ia64 code can see them too. CVE-2013-2929 Reported-by: Vasily Kulikov Signed-off-by: Kees Cook Cc: "Luck, Tony" Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/processor.h | 2 +- fs/exec.c | 6 ++++++ include/linux/binfmts.h | 3 --- include/linux/sched.h | 4 ++++ kernel/ptrace.c | 3 ++- 5 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index e0a899a1a8a6..5a84b3a50741 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -319,7 +319,7 @@ struct thread_struct { regs->loadrs = 0; \ regs->r8 = get_dumpable(current->mm); /* set "don't zap registers" flag */ \ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ - if (unlikely(!get_dumpable(current->mm))) { \ + if (unlikely(get_dumpable(current->mm) != SUID_DUMP_USER)) { \ /* \ * Zap scratch regs to avoid leaking bits between processes with different \ * uid/privileges. \ diff --git a/fs/exec.c b/fs/exec.c index 2ea437e5acf4..12120620f040 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1669,6 +1669,12 @@ int __get_dumpable(unsigned long mm_flags) return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; } +/* + * This returns the actual value of the suid_dumpable flag. For things + * that are using this for checking for privilege transitions, it must + * test against SUID_DUMP_USER rather than treating it as a boolean + * value. + */ int get_dumpable(struct mm_struct *mm) { return __get_dumpable(mm->flags); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e8112ae50531..7554fd410bcc 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -99,9 +99,6 @@ extern void setup_new_exec(struct linux_binprm * bprm); extern void would_dump(struct linux_binprm *, struct file *); extern int suid_dumpable; -#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ -#define SUID_DUMP_USER 1 /* Dump as user of process */ -#define SUID_DUMP_ROOT 2 /* Dump as root */ /* Stack area protections */ #define EXSTACK_DEFAULT 0 /* Whatever the arch defaults to */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5e226fe3e512..f7efc8604652 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -323,6 +323,10 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ + /* mm flags */ /* dumpable bits */ #define MMF_DUMPABLE 0 /* core dump is permitted */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dd562e9aa2c8..1f4bcb3cc21c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -257,7 +257,8 @@ ok: if (task->mm) dumpable = get_dumpable(task->mm); rcu_read_lock(); - if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + if (dumpable != SUID_DUMP_USER && + !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { rcu_read_unlock(); return -EPERM; } -- cgit v1.2.3