From 59ea746337c69f6a5f1bc4d5e8544b3cbf12f801 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Jun 2008 13:56:40 +0200 Subject: MM: virtual address debug Add some (configurable) expensive sanity checking to catch wrong address translations on x86. - create linux/mmdebug.h file to be able include this file in asm headers to not get unsolvable loops in header files - __phys_addr on x86_32 became a function in ioremap.c since PAGE_OFFSET, is_vmalloc_addr and VMALLOC_* non-constasts are undefined if declared in page_32.h - add __phys_addr_const for initializing doublefault_tss.__cr3 Tested on 386, 386pae, x86_64 and x86_64 numa=fake=2. Contains Andi's enable numa virtual address debug patch. Signed-off-by: Jiri Slaby Cc: Andi Kleen Signed-off-by: Ingo Molnar --- mm/vmalloc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e45b0f3d125..dc41e9c8ca6f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -180,6 +180,11 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) pmd_t *pmd; pte_t *ptep, pte; + /* XXX we might need to change this if we add VIRTUAL_BUG_ON for + * architectures that do not vmalloc module space */ + VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && + !is_module_address(addr)); + if (!pgd_none(*pgd)) { pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { -- cgit v1.2.3 From 7aa413def76146f7b3784228556d9e4bc562eab3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Jun 2008 13:28:11 +0200 Subject: x86, MM: virtual address debug, cleanups Signed-off-by: Ingo Molnar --- include/linux/mmdebug.h | 4 ++-- mm/vmalloc.c | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 860ed1a71bbe..8a5509877192 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -6,13 +6,13 @@ #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) #else -#define VM_BUG_ON(cond) do { } while(0) +#define VM_BUG_ON(cond) do { } while (0) #endif #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else -#define VIRTUAL_BUG_ON(cond) do { } while(0) +#define VIRTUAL_BUG_ON(cond) do { } while (0) #endif #endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dc41e9c8ca6f..830a5580c5d7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -180,8 +180,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) pmd_t *pmd; pte_t *ptep, pte; - /* XXX we might need to change this if we add VIRTUAL_BUG_ON for - * architectures that do not vmalloc module space */ + /* + * XXX we might need to change this if we add VIRTUAL_BUG_ON for + * architectures that do not vmalloc module space + */ VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && !is_module_address(addr)); -- cgit v1.2.3 From 5843d9a4d0ba89719916c8f07fc9c57b7126be6d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 1 Aug 2008 03:15:21 +0200 Subject: x86, pat: avoid highmem cache attribute aliasing Highmem code can leave ptes and tlb entries around for a given page even after kunmap, and after it has been freed. >From what I can gather, the PAT code may change the cache attributes of arbitrary physical addresses (ie. including highmem pages), which would result in aliases in the case that it operates on one of these lazy tlb highmem pages. Flushing kmaps should solve the problem. 
I've also just added code for conditional flushing if we haven't got any dangling highmem aliases -- this should help performance if we change page attributes frequently or systems that aren't using much highmem pages (eg. if < 4G RAM). Should be turned into 2 patches, but just for RFC... Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 3 +++ mm/highmem.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2c5c18c2464d..4adb33628dec 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -777,6 +777,9 @@ static int do_change_page_attr_set_clr(unsigned long addr, int numpages, WARN_ON_ONCE(1); } + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; diff --git a/mm/highmem.c b/mm/highmem.c index e16e1523b688..b36b83b920ff 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); static void flush_all_zero_pkmaps(void) { int i; + int need_flush = 0; flush_cache_kmaps(); @@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void) &pkmap_page_table[i]); set_page_address(page, NULL); + need_flush = 1; } - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); + if (need_flush) + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } /** -- cgit v1.2.3 From 344c790e3821dac37eb742ddd0b611a300f78b9a Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Tue, 2 Sep 2008 14:35:38 -0700 Subject: mm: make setup_zone_migrate_reserve() aware of overlapping nodes I have gotten to the root cause of the hugetlb badness I reported back on August 15th. My system has the following memory topology (note the overlapping node): Node 0 Memory: 0x8000000-0x44000000 Node 1 Memory: 0x0-0x8000000 0x44000000-0x80000000 setup_zone_migrate_reserve() scans the address range 0x0-0x8000000 looking for a pageblock to move onto the MIGRATE_RESERVE list. Finding no candidates, it happily continues the scan into 0x8000000-0x44000000. When a pageblock is found, the pages are moved to the MIGRATE_RESERVE list on the wrong zone. Oops. setup_zone_migrate_reserve() should skip pageblocks in overlapping nodes. Signed-off-by: Adam Litke Acked-by: Mel Gorman Cc: Dave Hansen Cc: Nishanth Aravamudan Cc: Andy Whitcroft Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index af982f7cdb2a..feb791648a1f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -694,6 +694,9 @@ static int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); + if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; @@ -2516,6 +2519,10 @@ static void setup_zone_migrate_reserve(struct zone *zone) continue; page = pfn_to_page(pfn); + /* Watch out for overlapping nodes */ + if (page_to_nid(page) != zone_to_nid(zone)) + continue; + /* Blocks with reserved pages will never free, skip them. 
*/ if (PageReserved(page)) continue; -- cgit v1.2.3 From 6ccfa806a9cfbbf1cd43d5b6aa47ef2c0eb518fd Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Tue, 2 Sep 2008 14:35:40 -0700 Subject: VFS: fix dio write returning EIO when try_to_release_page fails Dio write returns EIO when try_to_release_page fails because bh is still referenced. The patch commit 3f31fddfa26b7594b44ff2b34f9a04ba409e0f91 Author: Mingming Cao Date: Fri Jul 25 01:46:22 2008 -0700 jbd: fix race between free buffer and commit transaction was merged into 2.6.27-rc1, but I noticed that this patch is not enough to fix the race. I did fsstress test heavily to 2.6.27-rc1, and found that dio write still sometimes got EIO through this test. The patch above fixed race between freeing buffer(dio) and committing transaction(jbd) but I discovered that there is another race, freeing buffer(dio) and ext3/4_ordered_writepage. : background_writeout() ->write_cache_pages() ->ext3_ordered_writepage() walk_page_buffers() -> take a bh ref block_write_full_page() -> unlock_page : <- end_page_writeback : <- race! (dio write->try_to_release_page fails) walk_page_buffers() ->release a bh ref ext3_ordered_writepage holds bh ref and does unlock_page remaining taking a bh ref, so this causes the race and failure of try_to_release_page. To fix this race, I used the approach of falling back to buffered writes if try_to_release_page() fails on a page. [akpm@linux-foundation.org: cleanups] Signed-off-by: Hisashi Hifumi Cc: Chris Mason Cc: Jan Kara Cc: Mingming Cao Cc: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 +++++++++-- mm/truncate.c | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 54e968650855..876bc595d0f8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2129,13 +2129,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + * without clobbering -EIOCBQUEUED from ->direct_IO(). */ if (mapping->nrpages) { written = invalidate_inode_pages2_range(mapping, pos >> PAGE_CACHE_SHIFT, end); - if (written) + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; goto out; + } } written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); diff --git a/mm/truncate.c b/mm/truncate.c index 250505091d37..6650c1d878b4 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -380,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EIO if any pages could not be invalidated. + * Returns -EBUSY if any pages could not be invalidated. 
*/ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -440,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret2 = do_launder_page(mapping, page); if (ret2 == 0) { if (!invalidate_complete_page2(mapping, page)) - ret2 = -EIO; + ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; -- cgit v1.2.3 From 527655835ebac8f58a8f800a10700712a4c2affd Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Tue, 2 Sep 2008 14:35:41 -0700 Subject: mm/bootmem: silence section mismatch warning - contig_page_data/bootmem_node_data WARNING: vmlinux.o(.data+0x1f5c0): Section mismatch in reference from the variable contig_page_data to the variable .init.data:bootmem_node_data The variable contig_page_data references the variable __initdata bootmem_node_data If the reference is valid then annotate the variable with __init* (see linux/init.h) or name the variable: *driver, *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console, Signed-off-by: Marcin Slusarz Cc: Johannes Weiner Cc: Sean MacLennan Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index feb791648a1f..e293c58bea58 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4071,7 +4071,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; EXPORT_SYMBOL(contig_page_data); #endif -- cgit v1.2.3 From b954185214c3b562c3fcc651e9ec69d421d76bfa Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 2 Sep 2008 14:35:58 -0700 Subject: mm: size of quicklists shouldn't be proportional to the number of CPUs Quicklists store pages for each CPU as caches. (Each CPU can cache node_free_pages/16 pages) It is used for page table cache. exit() will increase the cache size, while fork() consumes it. So for example if an apache-style application runs (one parent and many child model), one CPU process will fork() while another CPU will process the middleware work and exit(). At that time, the CPU on which the parent runs doesn't have page table cache at all. Others (on which children runs) have maximum caches. QList_max = (#ofCPUs - 1) x Free / 16 => QList_max / (Free + QList_max) = (#ofCPUs - 1) / (16 + #ofCPUs - 1) So, How much quicklist memory is used in the maximum case? This is proposional to # of CPUs because the limit of per cpu quicklist cache doesn't see the number of cpus. Above calculation mean Number of CPUs per node 2 4 8 16 ============================== ==================== QList_max / (Free + QList_max) 5.8% 16% 30% 48% Wow! Quicklist can spend about 50% memory at worst case. 
My demonstration program is here -------------------------------------------------------------------------------- #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define BUFFSIZE 512 int max_cpu(void) /* get max number of logical cpus from /proc/cpuinfo */ { FILE *fd; char *ret, buffer[BUFFSIZE]; int cpu = 1; fd = fopen("/proc/cpuinfo", "r"); if (fd == NULL) { perror("fopen(/proc/cpuinfo)"); exit(EXIT_FAILURE); } while (1) { ret = fgets(buffer, BUFFSIZE, fd); if (ret == NULL) break; if (!strncmp(buffer, "processor", 9)) cpu = atoi(strchr(buffer, ':') + 2); } fclose(fd); return cpu; } void cpu_bind(int cpu) /* bind current process to one cpu */ { cpu_set_t mask; int ret; CPU_ZERO(&mask); CPU_SET(cpu, &mask); ret = sched_setaffinity(0, sizeof(mask), &mask); if (ret == -1) { perror("sched_setaffinity()"); exit(EXIT_FAILURE); } sched_yield(); /* not necessary */ } #define MMAP_SIZE (10 * 1024 * 1024) /* 10 MB */ #define FORK_INTERVAL 1 /* 1 second */ main(int argc, char *argv[]) { int cpu_max, nextcpu; long pagesize; pid_t pid; /* set max number of logical cpu */ if (argc > 1) cpu_max = atoi(argv[1]) - 1; else cpu_max = max_cpu(); /* get the page size */ pagesize = sysconf(_SC_PAGESIZE); if (pagesize == -1) { perror("sysconf(_SC_PAGESIZE)"); exit(EXIT_FAILURE); } /* prepare parent process */ cpu_bind(0); nextcpu = cpu_max; loop: /* select destination cpu for child process by round-robin rule */ if (++nextcpu > cpu_max) nextcpu = 1; pid = fork(); if (pid == 0) { /* child action */ char *p; int i; /* consume page tables */ p = mmap(0, MMAP_SIZE, PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); i = MMAP_SIZE / pagesize; while (i-- > 0) { *p = 1; p += pagesize; } /* move to other cpu */ cpu_bind(nextcpu); /* printf("a child moved to cpu%d after mmap().\n", nextcpu); fflush(stdout); */ /* back page tables to pgtable_quicklist */ exit(0); } else if (pid > 0) { /* parent action */ sleep(FORK_INTERVAL); waitpid(pid, NULL, WNOHANG); } goto loop; } ---------------------------------------- When above program which does task migration runs, my 8GB box spends 800MB of memory for quicklist. This is not memory leak but doesn't seem good. % cat /proc/meminfo MemTotal: 7701568 kB MemFree: 4724672 kB (snip) Quicklists: 844800 kB because - My machine spec is number of numa node: 2 number of cpus: 8 (4CPU x2 node) total mem: 8GB (4GB x2 node) free mem: about 5GB - Then, 4.7GB x 16% ~= 880MB. So, Quicklist can use 800MB. So, if following spec machine run that program CPUs: 64 (8cpu x 8node) Mem: 1TB (128GB x8node) Then, quicklist can waste 300GB (= 1TB x 30%). It is too large. So, I don't like cache policies which is proportional to # of cpus. My patch changes the number of caches from: per-cpu-cache-amount = memory_on_node / 16 to per-cpu-cache-amount = memory_on_node / 16 / number_of_cpus_on_node. 
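For reference, here is a small stand-alone user-space C program (not part of the patch; the only kernel constant reused is the divisor 16, FRACTION_OF_NODE_MEM) that reproduces the worst-case percentages quoted in the table above and shows how dividing the per-CPU cap by the CPU count bounds them:
----------------------------------------
/*
 * Sketch, not kernel code: reproduces the worst-case quicklist arithmetic.
 * Old cap: each CPU may cache node_free_pages/16 pages, so up to
 * (ncpus - 1) caches can sit idle while one CPU keeps allocating.
 * New cap: the per-CPU limit is additionally divided by the CPU count.
 */
#include <stdio.h>

#define FRACTION_OF_NODE_MEM 16   /* same divisor the kernel uses */

int main(void)
{
        int ncpus[] = { 2, 4, 8, 16 };
        unsigned int i;

        for (i = 0; i < sizeof(ncpus) / sizeof(ncpus[0]); i++) {
                int n = ncpus[i];
                /* QList_max / (Free + QList_max) = (n - 1) / (16 + n - 1) */
                double old_worst = (double)(n - 1) /
                                   (FRACTION_OF_NODE_MEM + n - 1);
                /* with the patch: per-CPU cap = Free / (16 * n) */
                double new_worst = (double)(n - 1) /
                                   (n * FRACTION_OF_NODE_MEM + n - 1);

                printf("%2d cpus/node: old worst case %4.1f%%, new %4.1f%%\n",
                       n, 100.0 * old_worst, 100.0 * new_worst);
        }
        return 0;
}
----------------------------------------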
Signed-off-by: KOSAKI Motohiro Cc: Keiichiro Tokunaga Acked-by: Christoph Lameter Tested-by: David Miller Acked-by: Mike Travis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/quicklist.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/quicklist.c b/mm/quicklist.c index 3f703f7cb398..8dbb6805ef35 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; static unsigned long max_pages(unsigned long min_pages) { unsigned long node_free_pages, max; - struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + int node = numa_node_id(); + struct zone *zones = NODE_DATA(node)->node_zones; + int num_cpus_on_node; + node_to_cpumask_ptr(cpumask_on_node, node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages) zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); max = node_free_pages / FRACTION_OF_NODE_MEM; + + num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + max /= num_cpus_on_node; + return max(max, min_pages); } -- cgit v1.2.3 From ce36394269ccd9d1d286d6192ba09fa6894365e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 16:09:47 +0200 Subject: mmap: fix petty bug in anonymous shared mmap offset handling Anonymous mappings should ignore offset but shared anonymous mapping forgot to clear it and makes the following legit test program trigger SIGBUS. #include #include #include #define PAGE_SIZE 4096 int main(void) { char *p; int i; p = mmap(NULL, 2 * PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, PAGE_SIZE); if (p == MAP_FAILED) { perror("mmap"); return 1; } for (i = 0; i < 2; i++) { printf("page %d\n", i); p[i * 4096] = i; } return 0; } Fix it. Signed-off-by: Tejun Heo Acked-by: Hugh Dickins Acked-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 339cf5c4d5d8..e7a5a68a9c2e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1030,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Ignore pgoff. + */ + pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: -- cgit v1.2.3 From 5bead2a0680687b9576d57c177988e8aa082b922 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 13 Sep 2008 02:33:19 -0700 Subject: mm: mark the correct zone as full when scanning zonelists The iterator for_each_zone_zonelist() uses a struct zoneref *z cursor when scanning zonelists to keep track of where in the zonelist it is. The zoneref that is returned corresponds to the the next zone that is to be scanned, not the current one. It was intended to be treated as an opaque list. When the page allocator is scanning a zonelist, it marks elements in the zonelist corresponding to zones that are temporarily full. As the zonelist is being updated, it uses the cursor here; if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); This is intended to prevent rescanning in the near future but the zoneref cursor does not correspond to the zone that has been found to be full. This is an easy misunderstanding to make so this patch corrects the problem by changing zoneref cursor to be the current zone being scanned instead of the next one. 
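A toy user-space model of the cursor semantics being fixed here (all names are invented; this is not the kernel iterator): the cursor handed back must point at the zone that was just returned, so that "mark this zone full" tags the slot that was actually scanned, while the iteration macro advances the cursor itself before the next call:
----------------------------------------
#include <stdio.h>

struct toy_zoneref {
        const char *name;       /* stands in for zoneref->zone */
        int full;               /* stands in for the zonelist-cache "full" bit */
};

/* Return the first usable entry at or after z; the cursor IS the result. */
static struct toy_zoneref *toy_next(struct toy_zoneref *z)
{
        while (z->name && z->full)      /* skip zones already marked full */
                z++;
        return z->name ? z : NULL;
}

int main(void)
{
        struct toy_zoneref zl[] = {
                { "Node0-Normal", 0 },
                { "Node1-Normal", 0 },
                { NULL, 0 },            /* end of zonelist */
        };
        struct toy_zoneref *z;

        /* Mirrors: for (z = first_zones_zonelist(...); zone;
         *               z = next_zones_zonelist(++z, ...)) */
        for (z = toy_next(zl); z; z = toy_next(z + 1)) {
                printf("scanning %s\n", z->name);
                /* the "mark full" step now tags the zone that was scanned,
                 * not the one after it */
                z->full = 1;
        }
        return 0;
}
----------------------------------------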
Signed-off-by: Mel Gorman Cc: Andy Whitcroft Cc: KAMEZAWA Hiroyuki Cc: [2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++------ mm/mmzone.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 443bc7cd8c62..428328a05fa1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -751,8 +751,9 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the - * search. The zoneref returned is a cursor that is used as the next starting - * point for future calls to next_zones_zonelist(). + * search. The zoneref returned is a cursor that represents the current zone + * being examined. It should be advanced by one before calling + * next_zones_zonelist again. */ struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, @@ -768,9 +769,8 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be - * used to iterate the zonelist with next_zones_zonelist. The cursor should - * not be used by the caller as it does not match the value of the zone - * returned. + * used to iterate the zonelist with next_zones_zonelist by advancing it by + * one before calling. */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, @@ -795,7 +795,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ zone; \ - z = next_zones_zonelist(z, highidx, nodemask, &zone)) \ + z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index diff --git a/mm/mmzone.c b/mm/mmzone.c index 486ed595ee6f..16ce8b955dcf 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, (z->zone && !zref_in_nodemask(z, nodes))) z++; - *zone = zonelist_zone(z++); + *zone = zonelist_zone(z); return z; } -- cgit v1.2.3 From 02b71b70129aaaa38f280af2aa5a767a4dec9107 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 11 Sep 2008 12:25:41 -0700 Subject: slub: fixed uninitialized counter in struct kmem_cache_node Initialized total objects atomic for the node in init_kmem_cache_node. The uninitialized value was ruining the stats in /proc/slabinfo. 
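A minimal user-space illustration of the problem class (not SLUB code): counters embedded in freshly allocated, non-zeroed memory report garbage until each one is explicitly initialised, which is all the one-line patch below does for total_objects:
----------------------------------------
#include <stdio.h>
#include <stdlib.h>

struct toy_node_stats {
        long nr_slabs;          /* was initialised before the fix */
        long total_objects;     /* was left uninitialised before the fix */
};

static void init_node_stats(struct toy_node_stats *n)
{
        n->nr_slabs = 0;
        n->total_objects = 0;   /* conceptually, the line the patch adds */
}

int main(void)
{
        /* malloc() does not zero memory, just like the allocation backing
         * struct kmem_cache_node in the kernel. */
        struct toy_node_stats *n = malloc(sizeof(*n));

        if (!n)
                return 1;
        init_node_stats(n);
        printf("nr_slabs=%ld total_objects=%ld\n",
               n->nr_slabs, n->total_objects);
        free(n);
        return 0;
}
----------------------------------------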
Acked-by: Christoph Lameter Signed-off-by: Salman Qazi Signed-off-by: Pekka Enberg --- mm/slub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index fb486d5540f8..0c83e6afe7b2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1932,6 +1932,7 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif } -- cgit v1.2.3 From db203d53d474aa068984e409d807628f5841da1b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 22 Sep 2008 13:57:50 -0700 Subject: mm: tiny-shmem fix lock ordering: mmap_sem vs i_mutex tiny-shmem calls do_truncate in shmem_file_setup. do_truncate takes i_mutex, and shmem_file_setup is called with mmap_sem held. However i_mutex nests outside mmap_sem. Copy the code in shmem.c to avoid this problem. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Reported-and-tested-by: Ingo Molnar Cc: Peter Zijlstra Cc: Matt Mackall Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/tiny-shmem.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index ae532f501943..d17cb6f6ab10 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -65,31 +65,25 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (!dentry) goto put_memory; - error = -ENOSPC; - inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); - if (!inode) - goto put_dentry; - - d_instantiate(dentry, inode); error = -ENFILE; - file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &ramfs_file_operations); + file = get_empty_filp(); if (!file) goto put_dentry; - inode->i_nlink = 0; /* It is unlinked */ - - /* notify everyone as to the change of file size */ - error = do_truncate(dentry, size, 0, file); - if (error < 0) + error = -ENOSPC; + inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + if (!inode) goto close_file; + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ + init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + &ramfs_file_operations); return file; close_file: put_filp(file); - return ERR_PTR(error); - put_dentry: dput(dentry); put_memory: -- cgit v1.2.3 From a10cebf56ca7e7c034d1b6646230c6553e478967 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Mon, 22 Sep 2008 13:57:52 -0700 Subject: memcg: check under limit at shrink_usage Current memory cgroup(both in mainline and -mm) doesn't account swap caches as memory(swap cache support is dropped temporarily now). So try_to_free_mem_cgroup_pages doesn't reflect the count of pages that have been moved to swap cache. But this makes mem_cgroup_shrink_usage fail easily if most of the pages are anon/shmem, and then shmem_getpage returns -ENOMEM and the process will be killed. This patch adds res_counter_check_under_limit to avoid these cases. BTW, even if swap cache support is enabled again, if a process is moved to another cgroup, which has been just made, between precharge and shrink_usage in shmem_getpage, shrink_usage may fail just because there is no pages to reclaim. So this change would make sense anyway. 
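A simplified sketch of the retry-loop change (invented helper names, not the memcg implementation): treating "usage is already back under the limit" as progress keeps the loop from reporting a spurious failure when there is simply nothing left to reclaim:
----------------------------------------
#include <stdio.h>

static long usage = 90, limit = 100;    /* pretend page counts */

static int try_reclaim(void)
{
        /* Pages that moved to swap cache are not accounted here, so this can
         * legitimately return 0 even though usage is below the limit. */
        return 0;
}

static int under_limit(void)
{
        return usage < limit;
}

static int shrink_usage(void)
{
        int retry = 5, progress;

        do {
                progress = try_reclaim();
                progress += under_limit();      /* the added check */
        } while (!progress && --retry);

        return progress ? 0 : -1;       /* -1 stands in for -ENOMEM */
}

int main(void)
{
        printf("shrink_usage() -> %d\n", shrink_usage());
        return 0;
}
----------------------------------------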
Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Cc: Pavel Emelyanov Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f1f7a7374ba..c0500e4d3a2f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -806,6 +806,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) do { progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + progress += res_counter_check_under_limit(&mem->res); } while (!progress && --retry); css_put(&mem->css); -- cgit v1.2.3 From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sun, 28 Sep 2008 23:09:31 +0100 Subject: mm owner: fix race between swapoff and exit There's a race between mm->owner assignment and swapoff, more easily seen when task slab poisoning is turned on. The condition occurs when try_to_unuse() runs in parallel with an exiting task. A similar race can occur with callers of get_task_mm(), such as /proc// or ptrace or page migration. CPU0 CPU1 try_to_unuse looks at mm = task0->mm increments mm->mm_users task 0 exits mm->owner needs to be updated, but no new owner is found (mm_users > 1, but no other task has task->mm = task0->mm) mm_update_next_owner() leaves mmput(mm) decrements mm->mm_users task0 freed dereferencing mm->owner fails The fix is to notify the subsystem via mm_owner_changed callback(), if no new owner is found, by specifying the new task as NULL. Jiri Slaby: mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but must be set after that, so as not to pass NULL as old owner causing oops. Daisuke Nishimura: mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task() and its callers need to take account of this situation to avoid oops. Hugh Dickins: Lockdep warning and hang below exec_mmap() when testing these patches. exit_mm() up_reads mmap_sem before calling mm_update_next_owner(), so exec_mmap() now needs to do the same. And with that repositioning, there's now no point in mm_need_new_owner() allowing for NULL mm. 
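The memcontrol.c hunks below follow a simple defensive pattern, sketched here with invented user-space types: once mm->owner may legally be NULL, every lookup that starts from the owner has to check for it and treat the racing exit as "no cgroup to charge":
----------------------------------------
#include <stdio.h>

struct cgroup_state { const char *name; };
struct task  { struct cgroup_state *cg; };
struct mm    { struct task *owner; };   /* owner may become NULL after an exit race */

static struct cgroup_state *cgroup_from_task(struct task *p)
{
        if (!p)                 /* owner already torn down: no cgroup */
                return NULL;
        return p->cg;
}

static int charge(struct mm *mm)
{
        struct cgroup_state *cg = cgroup_from_task(mm->owner);

        if (!cg)                /* treat the racing exit as "nothing to charge" */
                return 0;
        printf("charging %s\n", cg->name);
        return 0;
}

int main(void)
{
        struct cgroup_state cg = { "group-A" };
        struct task t = { &cg };
        struct mm alive = { &t }, dying = { NULL };

        charge(&alive);
        charge(&dying);         /* must not crash */
        return 0;
}
----------------------------------------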
Reported-by: Hugh Dickins Signed-off-by: Balbir Singh Signed-off-by: Jiri Slaby Signed-off-by: Daisuke Nishimura Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- kernel/cgroup.c | 5 +++-- kernel/exit.c | 12 ++++++++++-- mm/memcontrol.c | 17 +++++++++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/fs/exec.c b/fs/exec.c index 32993beecbe9..cecee501ce78 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - mm_update_next_owner(old_mm); arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); + mm_update_next_owner(old_mm); mmput(old_mm); return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13932abde159..a0123d75ec9a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { - struct cgroup *oldcgrp, *newcgrp; + struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); - newcgrp = task_cgroup(new, ss->subsys_id); + if (new) + newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) diff --git a/kernel/exit.c b/kernel/exit.c index 16395644a98f..85a83c831856 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -627,6 +625,16 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. + */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0500e4d3a2f..36896f3eb7f5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { + /* + * mm_update_next_owner() may clear mm->owner to NULL + * if it races with swapoff, page migration, etc. + * So this can be called with p == NULL. 
+ */ + if (unlikely(!p)) + return NULL; + return container_of(task_subsys_state(p, mem_cgroup_subsys_id), struct mem_cgroup, css); } @@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (likely(!memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + kmem_cache_free(page_cgroup_cache, pc); + return 0; + } /* * For every charge from the cgroup, increment reference count */ @@ -801,6 +814,10 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + return 0; + } css_get(&mem->css); rcu_read_unlock(); -- cgit v1.2.3 From 6c1b7f680dd4f550fa6f91f148cc6fa2c4bd0737 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 2 Oct 2008 14:50:16 -0700 Subject: memory hotplug: missing zone->lock in test_pages_isolated() __test_page_isolated_in_pageblock() in mm/page_isolation.c has a comment saying that the caller must hold zone->lock. But the only caller of that function, test_pages_isolated(), does not hold zone->lock and the lock is also not acquired anywhere before. This patch adds the missing zone->lock to test_pages_isolated(). We reproducibly run into BUG_ON(!PageBuddy(page)) in __offline_isolated_pages() during memory hotplug stress test, see trace below. This patch fixes that problem, it would be good if we could have it in 2.6.27. kernel BUG at /home/autobuild/BUILD/linux-2.6.26-20080909/mm/page_alloc.c:4561! illegal operation: 0001 [#1] PREEMPT SMP Modules linked in: dm_multipath sunrpc bonding qeth_l3 dm_mod qeth ccwgroup vmur CPU: 1 Not tainted 2.6.26-29.x.20080909-s390default #1 Process memory_loop_all (pid: 10025, task: 2f444028, ksp: 2b10dd28) Krnl PSW : 040c0000 801727ea (__offline_isolated_pages+0x18e/0x1c4) R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:0 CC:0 PM:0 Krnl GPRS: 00000000 7e27fc00 00000000 7e27fc00 00000000 00000400 00014000 7e27fc01 00606f00 7e27fc00 00013fe0 2b10dd28 00000005 80172662 801727b2 2b10dd28 Krnl Code: 801727de: 5810900c l %r1,12(%r9) 801727e2: a7f4ffb3 brc 15,80172748 801727e6: a7f40001 brc 15,801727e8 >801727ea: a7f4ffbc brc 15,80172762 801727ee: a7f40001 brc 15,801727f0 801727f2: a7f4ffaf brc 15,80172750 801727f6: 0707 bcr 0,%r7 801727f8: 0017 unknown Call Trace: ([<0000000000172772>] __offline_isolated_pages+0x116/0x1c4) [<00000000001953a2>] offline_isolated_pages_cb+0x22/0x34 [<000000000013164c>] walk_memory_resource+0xcc/0x11c [<000000000019520e>] offline_pages+0x36a/0x498 [<00000000001004d6>] remove_memory+0x36/0x44 [<000000000028fb06>] memory_block_change_state+0x112/0x150 [<000000000028ffb8>] store_mem_state+0x90/0xe4 [<0000000000289c00>] sysdev_store+0x34/0x40 [<00000000001ee048>] sysfs_write_file+0xd0/0x178 [<000000000019b1a8>] vfs_write+0x74/0x118 [<000000000019b9ae>] sys_write+0x46/0x7c [<000000000011160e>] sysc_do_restart+0x12/0x16 [<0000000077f3e8ca>] 0x77f3e8ca Signed-off-by: Gerald Schaefer Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c69f84fe038d..b70a7fec1ff6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -114,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) { - 
unsigned long pfn; + unsigned long pfn, flags; struct page *page; + struct zone *zone; + int ret; pfn = start_pfn; /* @@ -131,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) if (pfn < end_pfn) return -EBUSY; /* Check all pages are free or Marked as ISOLATED */ - if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) - return 0; - return -EBUSY; + zone = page_zone(pfn_to_page(pfn)); + spin_lock_irqsave(&zone->lock, flags); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + return ret ? 0 : -EBUSY; } -- cgit v1.2.3 From 4b19de6d1cb07c8bcb6778e771f9cfd5bcfdfd3e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 2 Oct 2008 14:50:16 -0700 Subject: mm: tiny-shmem nommu fix The previous patch db203d53d474aa068984e409d807628f5841da1b ("mm: tiny-shmem fix lock ordering: mmap_sem vs i_mutex") to fix the lock ordering in tiny-shmem breaks shared anonymous and IPC memory on NOMMU architectures because it was using the expanding truncate to signal ramfs to allocate a physically contiguous RAM backing the inode (otherwise it is unusable for "memory mapping" it to userspace). However do_truncate is what caused the lock ordering error, due to it taking i_mutex. In this case, we can actually just call ramfs directly to allocate memory for the mapping, rather than go via truncate. Acked-by: David Howells Acked-by: Hugh Dickins Signed-off-by: Nick Piggin Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 2 +- include/linux/ramfs.h | 1 + mm/tiny-shmem.c | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 52312ec93ff4..5145cb9125af 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = { * size 0 on the assumption that it's going to be used for an mmap of shared * memory */ -static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { struct pagevec lru_pvec; unsigned long npages, xpages, loop, limit; diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index b160fb18e8d6..37aaf2b39863 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -6,6 +6,7 @@ extern int ramfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt); #ifndef CONFIG_MMU +extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize); extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index d17cb6f6ab10..8d7a27a6335c 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -80,6 +80,12 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) inode->i_nlink = 0; /* It is unlinked */ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, &ramfs_file_operations); + +#ifndef CONFIG_MMU + error = ramfs_nommu_expand_for_mapping(inode, size); + if (error) + goto close_file; +#endif return file; close_file: -- cgit v1.2.3 From 6babc32c41e3642d875372cb6afbd9ade7a9f311 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 2 Oct 2008 14:50:18 -0700 Subject: mm: handle initialising compound pages at orders greater than MAX_ORDER When we initialise a compound page we initialise the page flags and head page pointer for all 
base pages spanned by that page. When we initialise a gigantic page (a page of order greater than or equal to MAX_ORDER) we have to initialise more than MAX_ORDER_NR_PAGES pages. Currently we assume that all elements of the mem_map in this page are contigious in memory. However this is only guarenteed out to MAX_ORDER_NR_PAGES pages, and with SPARSEMEM enabled they will not be contigious. This leads us to walk off the end of the first section and scribble on everything which follows, BAD. When we reach a MAX_ORDER_NR_PAGES boundary we much locate the next section of the mem_map. As gigantic pages can only be maximally aligned we know this will occur at exact multiple of MAX_ORDER_NR_PAGES pages from the start of the page. This is a bug fix for the gigantic page support in hugetlbfs. Credit to Mel Gorman for spotting the issue. Signed-off-by: Andy Whitcroft Cc: Mel Gorman Cc: Jon Tollefson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e293c58bea58..27b8681139fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -268,13 +268,14 @@ void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + struct page *p = page + 1; set_compound_page_dtor(page, free_compound_page); set_compound_order(page, order); __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - + for (i = 1; i < nr_pages; i++, p++) { + if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) + p = pfn_to_page(page_to_pfn(page) + i); __SetPageTail(p); p->first_page = page; } @@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + struct page *p = page + 1; if (unlikely(compound_order(page) != order)) bad_page(page); @@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageHead(page))) bad_page(page); __ClearPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; + for (i = 1; i < nr_pages; i++, p++) { + if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) + p = pfn_to_page(page_to_pfn(page) + i); if (unlikely(!PageTail(p) | (p->first_page != page))) -- cgit v1.2.3 From 85ba94ba0592296053f7f2846812173424afe1cb Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Tue, 7 Oct 2008 11:37:35 -0500 Subject: SLOB: fix bogus ksize calculation SLOB's ksize calculation was braindamaged and generally harmlessly underreported the allocation size. But for very small buffers, it could in fact overreport them, leading code depending on krealloc to overrun the allocation and trample other data. 
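A user-space sketch (made-up sizes, not SLOB internals) of why an overreporting ksize() is more than a cosmetic bug: krealloc() keeps the old block whenever the new size still fits within ksize(), so an inflated answer invites the caller to write past the memory that was really allocated:
----------------------------------------
#include <stdio.h>
#include <stdlib.h>

static size_t real_size = 8;            /* what the allocator really handed out */

static size_t broken_ksize(const void *p)
{
        (void)p;
        return real_size + 8;           /* overreport, like the pre-fix code */
}

static void *toy_krealloc(void *p, size_t new_size)
{
        if (new_size <= broken_ksize(p))
                return p;               /* "fits" - but only on paper */
        return realloc(p, new_size);
}

int main(void)
{
        char *p = malloc(real_size);
        char *q;

        if (!p)
                return 1;
        q = toy_krealloc(p, 12);        /* no new allocation happens */
        if (q == p)
                printf("krealloc kept an %zu-byte block for a 12-byte request:"
                       " the caller would now overrun it\n", real_size);
        free(q);
        return 0;
}
----------------------------------------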
Signed-off-by: Matt Mackall Tested-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- mm/slob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 4c82dd41f32e..62b679dc660f 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -515,7 +515,7 @@ size_t ksize(const void *block) sp = (struct slob_page *)virt_to_page(block); if (slob_page(sp)) - return ((slob_t *)block - 1)->units + SLOB_UNIT; + return (((slob_t *)block - 1)->units - 1) * SLOB_UNIT; else return sp->page.private; } -- cgit v1.2.3 From 36144077bce9f89763ce994bc631cbd1c9db7785 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Aug 2008 13:12:15 +0200 Subject: highmem: use bio_has_data() in the bounce path Signed-off-by: Jens Axboe --- mm/bounce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bounce.c b/mm/bounce.c index b6d2d0f1019b..06722c403058 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * Data-less bio, nothing to bounce */ - if (bio_empty_barrier(*bio_orig)) + if (!bio_has_data(*bio_orig)) return; /* -- cgit v1.2.3 From 70096a561d1e09120bae1f293f3632cedbfd5c68 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Wed, 8 Oct 2008 14:51:57 -0500 Subject: SLOB: fix bogus ksize calculation fix This fixes the previous fix, which was completely wrong on closer inspection. This version has been manually tested with a user-space test harness and generates sane values. A nearly identical patch has been boot-tested. The problem arose from changing how kmalloc/kfree handled alignment padding without updating ksize to match. This brings it in sync. Signed-off-by: Matt Mackall Signed-off-by: Linus Torvalds --- mm/slob.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 62b679dc660f..cb675d126791 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -514,9 +514,11 @@ size_t ksize(const void *block) return 0; sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) - return (((slob_t *)block - 1)->units - 1) * SLOB_UNIT; - else + if (slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int *m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; + } else return sp->page.private; } -- cgit v1.2.3
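To close, a stripped-down model of the bookkeeping the final ksize() fix relies on (an assumption-laden toy, not SLOB's real allocator): small kmalloc-style allocations carry their requested size in a header one alignment slot before the returned pointer, and ksize() reads that header back and rounds up to whole allocation units:
----------------------------------------
#include <stdio.h>
#include <stdlib.h>

#define TOY_ALIGN       8       /* stands in for max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN) */
#define TOY_UNIT        4       /* stands in for SLOB_UNIT */
#define TOY_UNITS(sz)   (((sz) + TOY_UNIT - 1) / TOY_UNIT)

static void *toy_kmalloc(size_t size)
{
        unsigned int *m = malloc(TOY_ALIGN + size);

        if (!m)
                return NULL;
        *m = (unsigned int)size;        /* record the requested size */
        return (char *)m + TOY_ALIGN;   /* caller sees only the payload */
}

static size_t toy_ksize(const void *block)
{
        const unsigned int *m =
                (const unsigned int *)((const char *)block - TOY_ALIGN);

        return TOY_UNITS(*m) * TOY_UNIT;        /* round up to whole units */
}

static void toy_kfree(void *block)
{
        free((char *)block - TOY_ALIGN);
}

int main(void)
{
        void *p = toy_kmalloc(10);

        if (!p)
                return 1;
        printf("requested 10 bytes, toy_ksize() reports %zu\n", toy_ksize(p));
        toy_kfree(p);
        return 0;
}
----------------------------------------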