From 046c68842bce6b77509cf56e94a561029124b0ce Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 5 Jan 2009 14:06:29 +0000 Subject: mm: update my address Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index d4855a682ab6..2c778fcfd9bd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3,7 +3,7 @@ * * Written by obz. * - * Address space accounting code + * Address space accounting code */ #include -- cgit v1.2.3 From 48aae42556e5ea1ba0d8ddab25352706577af2ed Mon Sep 17 00:00:00 2001 From: ZhenwenXu Date: Tue, 6 Jan 2009 14:40:21 -0800 Subject: mm/mmap.c: fix coding style Fix a little of the coding style in mm/mmap.c [akpm@linux-foundation.org: cleanup] Signed-off-by: ZhenwenXu Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 2c778fcfd9bd..e4507b23e620 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, static void __vma_link_file(struct vm_area_struct *vma) { - struct file * file; + struct file *file; file = vma->vm_file; if (file) { @@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, * insert vm structure into list and rbtree and anon_vma, * but it has already been inserted into prio_tree earlier. */ -static void -__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; + struct vm_area_struct *__vma, *prev; + struct rb_node **rb_link, *rb_parent; __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); BUG_ON(__vma && __vma->vm_start < vma->vm_end); @@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, * The caller must hold down_write(current->mm->mmap_sem). */ -unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, +unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff) { @@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = NULL; @@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma = NULL, *prev = NULL; - struct rb_node * rb_node; + struct rb_node *rb_node; if (!mm) goto out; @@ -1541,7 +1540,7 @@ out: * update accounting. This is shared with both the * grow-up and grow-down cases. */ -static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) +static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; struct rlimit *rlim = current->signal->rlim; -- cgit v1.2.3 From 901608d9045146aec6f14a7777ea4b1501c379f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Jan 2009 14:40:29 -0800 Subject: mm: introduce get_mm_hiwater_xxx(), fix taskstats->hiwater_xxx accounting xacct_add_tsk() relies on do_exit()->update_hiwater_xxx() and uses mm->hiwater_xxx directly, this leads to 2 problems: - taskstats_user_cmd() can call fill_pid()->xacct_add_tsk() at any moment before the task exits, so we should check the current values of rss/vm anyway. - do_exit()->update_hiwater_xxx() calls are racy. An exiting thread can be preempted right before mm->hiwater_xxx = new_val, and another thread can use A_LOT of memory and exit in between. When the first thread resumes it can be the last thread in the thread group, in that case we report the wrong hiwater_xxx values which do not take A_LOT into account. Introduce get_mm_hiwater_rss() and get_mm_hiwater_vm() helpers and change xacct_add_tsk() to use them. The first helper will also be used by rusage->ru_maxrss accounting. Kill do_exit()->update_hiwater_xxx() calls. Unless we are going to decrease rss/vm there is no point to update mm->hiwater_xxx, and nobody can look at this mm_struct when exit_mmap() actually unmaps the memory. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ kernel/exit.c | 5 +---- kernel/tsacct.c | 4 ++-- mm/mmap.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/mmap.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 38a3f4b15394..ea415136ac9e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -386,6 +386,9 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) +#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm)) +#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm) + extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); diff --git a/kernel/exit.c b/kernel/exit.c index f923724ab3c9..c7740fa3252c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1051,10 +1051,7 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 2dc06ab35716..43f891b05a4b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; + stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar; diff --git a/mm/mmap.c b/mm/mmap.c index e4507b23e620..1f97d8aa9b05 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2102,7 +2102,7 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); - /* Don't update_hiwater_rss(mm) here, do_exit already did */ + /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); -- cgit v1.2.3 From dcd4a049b9751828c516c59709f3fdf50436df85 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Jan 2009 14:40:31 -0800 Subject: mm: check for no mmaps in exit_mmap() When dup_mmap() ooms we can end up with mm->mmap == NULL. The error path does mmput() and unmap_vmas() gets a NULL vma which it dereferences. In exit_mmap() there is nothing to do at all for this case, we can cancel the callpath right there. [akpm@linux-foundation.org: add sorely-needed comment] Signed-off-by: Johannes Weiner Reported-by: Akinobu Mita Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 1f97d8aa9b05..a910c045cfd4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2090,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); mmu_notifier_release(mm); + if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ + return; + if (mm->locked_vm) { vma = mm->mmap; while (vma) { -- cgit v1.2.3 From 8feae13110d60cc6287afabc2887366b0eb226c2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Jan 2009 12:04:47 +0000 Subject: NOMMU: Make VMAs per MM as for MMU-mode linux Make VMAs per mm_struct as for MMU-mode linux. This solves two problems: (1) In SYSV SHM where nattch for a segment does not reflect the number of shmat's (and forks) done. (2) In mmap() where the VMA's vm_mm is set to point to the parent mm by an exec'ing process when VM_EXECUTABLE is specified, regardless of the fact that a VMA might be shared and already have its vm_mm assigned to another process or a dead process. A new struct (vm_region) is introduced to track a mapped region and to remember the circumstances under which it may be shared and the vm_list_struct structure is discarded as it's no longer required. This patch makes the following additional changes: (1) Regions are now allocated with alloc_pages() rather than kmalloc() and with no recourse to __GFP_COMP, so the pages are not composite. Instead, each page has a reference on it held by the region. Anything else that is interested in such a page will have to get a reference on it to retain it. When the pages are released due to unmapping, each page is passed to put_page() and will be freed when the page usage count reaches zero. (2) Excess pages are trimmed after an allocation as the allocation must be made as a power-of-2 quantity of pages. (3) VMAs are added to the parent MM's R/B tree and mmap lists. As an MM may end up with overlapping VMAs within the tree, the VMA struct address is appended to the sort key. (4) Non-anonymous VMAs are now added to the backing inode's prio list. (5) Holes may be punched in anonymous VMAs with munmap(), releasing parts of the backing region. The VMA and region structs will be split if necessary. (6) sys_shmdt() only releases one attachment to a SYSV IPC shared memory segment instead of all the attachments at that addresss. Multiple shmat()'s return the same address under NOMMU-mode instead of different virtual addresses as under MMU-mode. (7) Core dumping for ELF-FDPIC requires fewer exceptions for NOMMU-mode. (8) /proc/maps is now the global list of mapped regions, and may list bits that aren't actually mapped anywhere. (9) /proc/meminfo gains a line (tagged "MmapCopy") that indicates the amount of RAM currently allocated by mmap to hold mappable regions that can't be mapped directly. These are copies of the backing device or file if not anonymous. These changes make NOMMU mode more similar to MMU mode. The downside is that NOMMU mode requires some extra memory to track things over NOMMU without this patch (VMAs are no longer shared, and there are now region structs). Signed-off-by: David Howells Tested-by: Mike Frysinger Acked-by: Paul Mundt --- Documentation/nommu-mmap.txt | 18 +- arch/arm/include/asm/mmu.h | 1 - arch/blackfin/include/asm/mmu.h | 1 - arch/blackfin/kernel/ptrace.c | 6 +- arch/blackfin/kernel/traps.c | 11 +- arch/frv/kernel/ptrace.c | 11 +- arch/h8300/include/asm/mmu.h | 1 - arch/m68knommu/include/asm/mmu.h | 1 - arch/sh/include/asm/mmu.h | 1 - fs/binfmt_elf_fdpic.c | 27 +- fs/proc/internal.h | 2 - fs/proc/meminfo.c | 6 + fs/proc/nommu.c | 71 ++- fs/proc/task_nommu.c | 108 +++-- include/asm-frv/mmu.h | 1 - include/asm-m32r/mmu.h | 1 - include/linux/mm.h | 18 +- include/linux/mm_types.h | 18 +- ipc/shm.c | 12 + kernel/fork.c | 4 +- lib/Kconfig.debug | 7 + mm/mmap.c | 10 + mm/nommu.c | 960 +++++++++++++++++++++++++++------------ 23 files changed, 860 insertions(+), 436 deletions(-) (limited to 'mm/mmap.c') diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 7714f57caad5..02b89dcf38ac 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -109,12 +109,18 @@ and it's also much more restricted in the latter case: FURTHER NOTES ON NO-MMU MMAP ============================ - (*) A request for a private mapping of less than a page in size may not return - a page-aligned buffer. This is because the kernel calls kmalloc() to - allocate the buffer, not get_free_page(). - - (*) A list of all the mappings on the system is visible through /proc/maps in - no-MMU mode. + (*) A request for a private mapping of a file may return a buffer that is not + page-aligned. This is because XIP may take place, and the data may not be + paged aligned in the backing store. + + (*) A request for an anonymous mapping will always be page aligned. If + possible the size of the request should be a power of two otherwise some + of the space may be wasted as the kernel must allocate a power-of-2 + granule but will only discard the excess if appropriately configured as + this has an effect on fragmentation. + + (*) A list of all the private copy and anonymous mappings on the system is + visible through /proc/maps in no-MMU mode. (*) A list of all the mappings in use by a process is visible through /proc//maps in no-MMU mode. diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 53099d4ee421..b561584d04a1 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -24,7 +24,6 @@ typedef struct { * modified for 2.6 by Hyok S. Choi */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/blackfin/include/asm/mmu.h b/arch/blackfin/include/asm/mmu.h index 757e43906ed4..dbfd686360e6 100644 --- a/arch/blackfin/include/asm/mmu.h +++ b/arch/blackfin/include/asm/mmu.h @@ -10,7 +10,6 @@ struct sram_list_struct { }; typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; unsigned long stack_start; diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index d2d388536630..594e325b40e4 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -160,15 +160,15 @@ put_reg(struct task_struct *task, int regno, unsigned long data) static inline int is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long len) { - struct vm_list_struct *vml; + struct vm_area_struct *vma; struct sram_list_struct *sraml; /* overflow */ if (start + len < start) return -EIO; - for (vml = child->mm->context.vmlist; vml; vml = vml->next) - if (start >= vml->vma->vm_start && start + len < vml->vma->vm_end) + vma = find_vma(child->mm, start); + if (vma && start >= vma->vm_start && start + len <= vma->vm_end) return 0; for (sraml = child->mm->context.sram_list; sraml; sraml = sraml->next) diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c index 17d8e4172896..5b0667da8d05 100644 --- a/arch/blackfin/kernel/traps.c +++ b/arch/blackfin/kernel/traps.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ static void decode_address(char *buf, unsigned long address) struct mm_struct *mm; unsigned long flags, offset; unsigned char in_atomic = (bfin_read_IPEND() & 0x10) || in_atomic(); + struct rb_node *n; #ifdef CONFIG_KALLSYMS unsigned long symsize; @@ -128,9 +130,10 @@ static void decode_address(char *buf, unsigned long address) if (!mm) continue; - vml = mm->context.vmlist; - while (vml) { - struct vm_area_struct *vma = vml->vma; + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + struct vm_area_struct *vma; + + vma = rb_entry(n, struct vm_area_struct, vm_rb); if (address >= vma->vm_start && address < vma->vm_end) { char _tmpbuf[256]; @@ -176,8 +179,6 @@ static void decode_address(char *buf, unsigned long address) goto done; } - - vml = vml->next; } if (!in_atomic) mmput(mm); diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c index 709e9bdc6126..5e7d401d21e7 100644 --- a/arch/frv/kernel/ptrace.c +++ b/arch/frv/kernel/ptrace.c @@ -69,7 +69,8 @@ static inline int put_reg(struct task_struct *task, int regno, } /* - * check that an address falls within the bounds of the target process's memory mappings + * check that an address falls within the bounds of the target process's memory + * mappings */ static inline int is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long len) @@ -79,11 +80,11 @@ static inline int is_user_addr_valid(struct task_struct *child, return -EIO; return 0; #else - struct vm_list_struct *vml; + struct vm_area_struct *vma; - for (vml = child->mm->context.vmlist; vml; vml = vml->next) - if (start >= vml->vma->vm_start && start + len <= vml->vma->vm_end) - return 0; + vma = find_vma(child->mm, start); + if (vma && start >= vma->vm_start && start + len <= vma->vm_end) + return 0; return -EIO; #endif diff --git a/arch/h8300/include/asm/mmu.h b/arch/h8300/include/asm/mmu.h index 2ce06ea46104..31309969df70 100644 --- a/arch/h8300/include/asm/mmu.h +++ b/arch/h8300/include/asm/mmu.h @@ -4,7 +4,6 @@ /* Copyright (C) 2002, David McCullough */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/m68knommu/include/asm/mmu.h b/arch/m68knommu/include/asm/mmu.h index 5fa6b68353ba..e2da1e6f09fe 100644 --- a/arch/m68knommu/include/asm/mmu.h +++ b/arch/m68knommu/include/asm/mmu.h @@ -4,7 +4,6 @@ /* Copyright (C) 2002, David McCullough */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h index fdcb93bc6d11..6c43625bb1a5 100644 --- a/arch/sh/include/asm/mmu.h +++ b/arch/sh/include/asm/mmu.h @@ -9,7 +9,6 @@ typedef struct { mm_context_id_t id; void *vdso; #else - struct vm_list_struct *vmlist; unsigned long end_brk; #endif #ifdef CONFIG_BINFMT_ELF_FDPIC diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index aa5b43205e37..22baf1b13493 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1567,11 +1567,9 @@ end_coredump: static int elf_fdpic_dump_segments(struct file *file, size_t *size, unsigned long *limit, unsigned long mm_flags) { - struct vm_list_struct *vml; - - for (vml = current->mm->context.vmlist; vml; vml = vml->next) { - struct vm_area_struct *vma = vml->vma; + struct vm_area_struct *vma; + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { if (!maydump(vma, mm_flags)) continue; @@ -1617,9 +1615,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, elf_fpxregset_t *xfpu = NULL; #endif int thread_status_size = 0; -#ifndef CONFIG_MMU - struct vm_list_struct *vml; -#endif elf_addr_t *auxv; unsigned long mm_flags; @@ -1685,13 +1680,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, fill_prstatus(prstatus, current, signr); elf_core_copy_regs(&prstatus->pr_reg, regs); -#ifdef CONFIG_MMU segs = current->mm->map_count; -#else - segs = 0; - for (vml = current->mm->context.vmlist; vml; vml = vml->next) - segs++; -#endif #ifdef ELF_CORE_EXTRA_PHDRS segs += ELF_CORE_EXTRA_PHDRS; #endif @@ -1766,20 +1755,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, mm_flags = current->mm->flags; /* write program headers for segments dump */ - for ( -#ifdef CONFIG_MMU - vma = current->mm->mmap; vma; vma = vma->vm_next -#else - vml = current->mm->context.vmlist; vml; vml = vml->next -#endif - ) { + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { struct elf_phdr phdr; size_t sz; -#ifndef CONFIG_MMU - vma = vml->vma; -#endif - sz = vma->vm_end - vma->vm_start; phdr.p_type = PT_LOAD; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e8aeb8b61ce..cd53ff838498 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -41,8 +41,6 @@ do { \ (vmi)->used = 0; \ (vmi)->largest_chunk = 0; \ } while(0) - -extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b1675c4e66da..43d23948384a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -73,6 +73,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" "LowFree: %8lu kB\n" +#endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" #endif "SwapTotal: %8lu kB\n" "SwapFree: %8lu kB\n" @@ -115,6 +118,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freehigh), K(i.totalram-i.totalhigh), K(i.freeram-i.freehigh), +#endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 3f87d2632947..b446d7ad0b0d 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -33,33 +33,33 @@ #include "internal.h" /* - * display a single VMA to a sequenced file + * display a single region to a sequenced file */ -int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +static int nommu_region_show(struct seq_file *m, struct vm_region *region) { unsigned long ino = 0; struct file *file; dev_t dev = 0; int flags, len; - flags = vma->vm_flags; - file = vma->vm_file; + flags = region->vm_flags; + file = region->vm_file; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = region->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", - vma->vm_start, - vma->vm_end, + region->vm_start, + region->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { @@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } /* - * display a list of all the VMAs the kernel knows about + * display a list of all the REGIONs the kernel knows about * - nommu kernals have a single flat list */ -static int nommu_vma_list_show(struct seq_file *m, void *v) +static int nommu_region_list_show(struct seq_file *m, void *_p) { - struct vm_area_struct *vma; + struct rb_node *p = _p; - vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); - return nommu_vma_show(m, vma); + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); } -static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) { - struct rb_node *_rb; + struct rb_node *p; loff_t pos = *_pos; - void *next = NULL; - down_read(&nommu_vma_sem); + down_read(&nommu_region_sem); - for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { - if (pos == 0) { - next = _rb; - break; - } - pos--; - } - - return next; + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; } -static void nommu_vma_list_stop(struct seq_file *m, void *v) +static void nommu_region_list_stop(struct seq_file *m, void *v) { - up_read(&nommu_vma_sem); + up_read(&nommu_region_sem); } -static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return rb_next((struct rb_node *) v); } -static const struct seq_operations proc_nommu_vma_list_seqop = { - .start = nommu_vma_list_start, - .next = nommu_vma_list_next, - .stop = nommu_vma_list_stop, - .show = nommu_vma_list_show +static struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show }; -static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) { - return seq_open(file, &proc_nommu_vma_list_seqop); + return seq_open(file, &proc_nommu_region_list_seqop); } -static const struct file_operations proc_nommu_vma_list_operations = { - .open = proc_nommu_vma_list_open, +static const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = { static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); return 0; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index d4a8be32b902..ca4a48d0d311 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -15,25 +15,25 @@ */ void task_mem(struct seq_file *m, struct mm_struct *mm) { - struct vm_list_struct *vml; + struct vm_area_struct *vma; + struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0; down_read(&mm->mmap_sem); - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (!vml->vma) - continue; + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); - bytes += kobjsize(vml); + bytes += kobjsize(vma); if (atomic_read(&mm->mm_count) > 1 || - atomic_read(&vml->vma->vm_usage) > 1 - ) { - sbytes += kobjsize((void *) vml->vma->vm_start); - sbytes += kobjsize(vml->vma); + vma->vm_region || + vma->vm_flags & VM_MAYSHARE) { + sbytes += kobjsize((void *) vma->vm_start); + if (vma->vm_region) + sbytes += kobjsize(vma->vm_region); } else { - bytes += kobjsize((void *) vml->vma->vm_start); - bytes += kobjsize(vml->vma); - slack += kobjsize((void *) vml->vma->vm_start) - - (vml->vma->vm_end - vml->vma->vm_start); + bytes += kobjsize((void *) vma->vm_start); + slack += kobjsize((void *) vma->vm_start) - + (vma->vm_end - vma->vm_start); } } @@ -70,13 +70,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct rb_node *p; unsigned long vsize = 0; down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - if (tbp->vma) - vsize += kobjsize((void *) tbp->vma->vm_start); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_region->vm_end - vma->vm_region->vm_start; } up_read(&mm->mmap_sem); return vsize; @@ -85,16 +86,15 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct rb_node *p; int size = kobjsize(mm); down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - size += kobjsize(tbp); - if (tbp->vma) { - size += kobjsize(tbp->vma); - size += kobjsize((void *) tbp->vma->vm_start); - } + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + size += kobjsize((void *) vma->vm_start); } size += (*text = mm->end_code - mm->start_code); @@ -104,21 +104,63 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, return size; } +/* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + vma->vm_pgoff << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + len = 25 + sizeof(void *) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + /* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_vml) +static int show_map(struct seq_file *m, void *_p) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; - return nommu_vma_show(m, vml->vma); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; - struct vm_list_struct *vml; struct mm_struct *mm; + struct rb_node *p; loff_t n = *pos; /* pin the task and mm whilst we play with them */ @@ -134,9 +176,9 @@ static void *m_start(struct seq_file *m, loff_t *pos) } /* start from the Nth VMA */ - for (vml = mm->context.vmlist; vml; vml = vml->next) + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) - return vml; + return p; return NULL; } @@ -152,12 +194,12 @@ static void m_stop(struct seq_file *m, void *_vml) } } -static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; (*pos)++; - return vml ? vml->next : NULL; + return p ? rb_next(p) : NULL; } static const struct seq_operations proc_pid_maps_ops = { diff --git a/include/asm-frv/mmu.h b/include/asm-frv/mmu.h index 22c03714fb14..86ca0e86e7d2 100644 --- a/include/asm-frv/mmu.h +++ b/include/asm-frv/mmu.h @@ -22,7 +22,6 @@ typedef struct { unsigned long dtlb_ptd_mapping; /* [DAMR5] PTD mapping for dtlb cached PGE */ #else - struct vm_list_struct *vmlist; unsigned long end_brk; #endif diff --git a/include/asm-m32r/mmu.h b/include/asm-m32r/mmu.h index d9bd724479cf..150cb92bb666 100644 --- a/include/asm-m32r/mmu.h +++ b/include/asm-m32r/mmu.h @@ -4,7 +4,6 @@ #if !defined(CONFIG_MMU) typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/include/linux/mm.h b/include/linux/mm.h index 4a3d28c86443..b91a73fd1bcc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -56,19 +56,9 @@ extern unsigned long mmap_min_addr; extern struct kmem_cache *vm_area_cachep; -/* - * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is - * disabled, then there's a single shared list of VMAs maintained by the - * system, and mm's subscribe to these individually - */ -struct vm_list_struct { - struct vm_list_struct *next; - struct vm_area_struct *vma; -}; - #ifndef CONFIG_MMU -extern struct rb_root nommu_vma_tree; -extern struct rw_semaphore nommu_vma_sem; +extern struct rb_root nommu_region_tree; +extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif @@ -1061,6 +1051,7 @@ extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_pages_min(void); extern void mem_init(void); +extern void __init mmap_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); @@ -1072,6 +1063,9 @@ extern void setup_per_cpu_pageset(void); static inline void setup_per_cpu_pageset(void) {} #endif +/* nommu.c */ +extern atomic_t mmap_pages_allocated; + /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9cfc9b627fdd..1c1e0d3a1714 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,22 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +/* + * A region containing a mapping of a non-memory backed file under NOMMU + * conditions. These are held in a global tree and are pinned by the VMAs that + * map parts of them. + */ +struct vm_region { + struct rb_node vm_rb; /* link in global region tree */ + unsigned long vm_flags; /* VMA vm_flags */ + unsigned long vm_start; /* start address of region */ + unsigned long vm_end; /* region initialised to here */ + unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ + struct file *vm_file; /* the backing file or NULL */ + + atomic_t vm_usage; /* region usage count */ +}; + /* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -152,7 +168,7 @@ struct vm_area_struct { unsigned long vm_truncate_count;/* truncate_count or restart_addr */ #ifndef CONFIG_MMU - atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */ + struct vm_region *vm_region; /* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ diff --git a/ipc/shm.c b/ipc/shm.c index b125b560240e..d0ab5527bf45 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -990,6 +990,7 @@ asmlinkage long sys_shmdt(char __user *shmaddr) */ vma = find_vma(mm, addr); +#ifdef CONFIG_MMU while (vma) { next = vma->vm_next; @@ -1034,6 +1035,17 @@ asmlinkage long sys_shmdt(char __user *shmaddr) vma = next; } +#else /* CONFIG_MMU */ + /* under NOMMU conditions, the exact address to be destroyed must be + * given */ + retval = -EINVAL; + if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + retval = 0; + } + +#endif + up_write(&mm->mmap_sem); return retval; } diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3d..0bce4a43bb37 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1481,12 +1481,10 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + mmap_init(); } /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2e75478e9c69..d0a32aab03ff 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -512,6 +512,13 @@ config DEBUG_VIRTUAL If unsure, say N. +config DEBUG_NOMMU_REGIONS + bool "Debug the global anon/private NOMMU mapping region tree" + depends on DEBUG_KERNEL && !MMU + help + This option causes the global tree of anonymous and private mapping + regions to be regularly checked for invalid topology. + config DEBUG_WRITECOUNT bool "Debug filesystem writers count" depends on DEBUG_KERNEL diff --git a/mm/mmap.c b/mm/mmap.c index a910c045cfd4..749623196cb9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2472,3 +2472,13 @@ void mm_drop_all_locks(struct mm_struct *mm) mutex_unlock(&mm_all_locks_mutex); } + +/* + * initialise the VMA slab + */ +void __init mmap_init(void) +{ + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL); +} diff --git a/mm/nommu.c b/mm/nommu.c index 23f355bbe262..0d363dfcf10e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -6,7 +6,7 @@ * * See Documentation/nommu-mmap.txt * - * Copyright (c) 2004-2005 David Howells + * Copyright (c) 2004-2008 David Howells * Copyright (c) 2000-2003 David McCullough * Copyright (c) 2000-2001 D Jeff Dionne * Copyright (c) 2002 Greg Ungerer @@ -33,6 +33,28 @@ #include #include #include +#include "internal.h" + +static inline __attribute__((format(printf, 1, 2))) +void no_printk(const char *fmt, ...) +{ +} + +#if 0 +#define kenter(FMT, ...) \ + printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) +#else +#define kenter(FMT, ...) \ + no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) +#endif #include "internal.h" @@ -46,12 +68,15 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int heap_stack_gap = 0; +atomic_t mmap_pages_allocated; + EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); -/* list of shareable VMAs */ -struct rb_root nommu_vma_tree = RB_ROOT; -DECLARE_RWSEM(nommu_vma_sem); +/* list of mapped, potentially shareable regions */ +static struct kmem_cache *vm_region_jar; +struct rb_root nommu_region_tree = RB_ROOT; +DECLARE_RWSEM(nommu_region_sem); struct vm_operations_struct generic_file_vm_ops = { }; @@ -400,129 +425,174 @@ asmlinkage unsigned long sys_brk(unsigned long brk) return mm->brk = brk; } -#ifdef DEBUG -static void show_process_blocks(void) +/* + * initialise the VMA and region record slabs + */ +void __init mmap_init(void) { - struct vm_list_struct *vml; - - printk("Process blocks %d:", current->pid); - - for (vml = ¤t->mm->context.vmlist; vml; vml = vml->next) { - printk(" %p: %p", vml, vml->vma); - if (vml->vma) - printk(" (%d @%lx #%d)", - kobjsize((void *) vml->vma->vm_start), - vml->vma->vm_start, - atomic_read(&vml->vma->vm_usage)); - printk(vml->next ? " ->" : ".\n"); - } + vm_region_jar = kmem_cache_create("vm_region_jar", + sizeof(struct vm_region), 0, + SLAB_PANIC, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL); } -#endif /* DEBUG */ /* - * add a VMA into a process's mm_struct in the appropriate place in the list - * - should be called with mm->mmap_sem held writelocked + * validate the region tree + * - the caller must hold the region lock */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) +#ifdef CONFIG_DEBUG_NOMMU_REGIONS +static noinline void validate_nommu_regions(void) { - struct vm_list_struct **ppv; + struct vm_region *region, *last; + struct rb_node *p, *lastp; - for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) - if ((*ppv)->vma->vm_start > vml->vma->vm_start) - break; + lastp = rb_first(&nommu_region_tree); + if (!lastp) + return; + + last = rb_entry(lastp, struct vm_region, vm_rb); + if (unlikely(last->vm_end <= last->vm_start)) + BUG(); + + while ((p = rb_next(lastp))) { + region = rb_entry(p, struct vm_region, vm_rb); + last = rb_entry(lastp, struct vm_region, vm_rb); + + if (unlikely(region->vm_end <= region->vm_start)) + BUG(); + if (unlikely(region->vm_start < last->vm_end)) + BUG(); - vml->next = *ppv; - *ppv = vml; + lastp = p; + } } +#else +#define validate_nommu_regions() do {} while(0) +#endif /* - * look up the first VMA in which addr resides, NULL if none - * - should be called with mm->mmap_sem at least held readlocked + * add a region into the global tree */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +static void add_nommu_region(struct vm_region *region) { - struct vm_list_struct *loop, *vml; + struct vm_region *pregion; + struct rb_node **p, *parent; - /* search the vm_start ordered list */ - vml = NULL; - for (loop = mm->context.vmlist; loop; loop = loop->next) { - if (loop->vma->vm_start > addr) - break; - vml = loop; + validate_nommu_regions(); + + BUG_ON(region->vm_start & ~PAGE_MASK); + + parent = NULL; + p = &nommu_region_tree.rb_node; + while (*p) { + parent = *p; + pregion = rb_entry(parent, struct vm_region, vm_rb); + if (region->vm_start < pregion->vm_start) + p = &(*p)->rb_left; + else if (region->vm_start > pregion->vm_start) + p = &(*p)->rb_right; + else if (pregion == region) + return; + else + BUG(); } - if (vml && vml->vma->vm_end > addr) - return vml->vma; + rb_link_node(®ion->vm_rb, parent, p); + rb_insert_color(®ion->vm_rb, &nommu_region_tree); - return NULL; + validate_nommu_regions(); } -EXPORT_SYMBOL(find_vma); /* - * find a VMA - * - we don't extend stack VMAs under NOMMU conditions + * delete a region from the global tree */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +static void delete_nommu_region(struct vm_region *region) { - return find_vma(mm, addr); -} + BUG_ON(!nommu_region_tree.rb_node); -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return -ENOMEM; + validate_nommu_regions(); + rb_erase(®ion->vm_rb, &nommu_region_tree); + validate_nommu_regions(); } /* - * look up the first VMA exactly that exactly matches addr - * - should be called with mm->mmap_sem at least held readlocked + * free a contiguous series of pages */ -static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, - unsigned long addr) +static void free_page_series(unsigned long from, unsigned long to) { - struct vm_list_struct *vml; - - /* search the vm_start ordered list */ - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (vml->vma->vm_start == addr) - return vml->vma; - if (vml->vma->vm_start > addr) - break; + for (; from < to; from += PAGE_SIZE) { + struct page *page = virt_to_page(from); + + kdebug("- free %lx", from); + atomic_dec(&mmap_pages_allocated); + if (page_count(page) != 1) + kdebug("free page %p [%d]", page, page_count(page)); + put_page(page); } - - return NULL; } /* - * find a VMA in the global tree + * release a reference to a region + * - the caller must hold the region semaphore, which this releases + * - the region may not have been added to the tree yet, in which case vm_end + * will equal vm_start */ -static inline struct vm_area_struct *find_nommu_vma(unsigned long start) +static void __put_nommu_region(struct vm_region *region) + __releases(nommu_region_sem) { - struct vm_area_struct *vma; - struct rb_node *n = nommu_vma_tree.rb_node; + kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); - while (n) { - vma = rb_entry(n, struct vm_area_struct, vm_rb); + BUG_ON(!nommu_region_tree.rb_node); - if (start < vma->vm_start) - n = n->rb_left; - else if (start > vma->vm_start) - n = n->rb_right; - else - return vma; + if (atomic_dec_and_test(®ion->vm_usage)) { + if (region->vm_end > region->vm_start) + delete_nommu_region(region); + up_write(&nommu_region_sem); + + if (region->vm_file) + fput(region->vm_file); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ + if (region->vm_flags & VM_MAPPED_COPY) { + kdebug("free series"); + free_page_series(region->vm_start, region->vm_end); + } + kmem_cache_free(vm_region_jar, region); + } else { + up_write(&nommu_region_sem); } +} - return NULL; +/* + * release a reference to a region + */ +static void put_nommu_region(struct vm_region *region) +{ + down_write(&nommu_region_sem); + __put_nommu_region(region); } /* - * add a VMA in the global tree + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_sem held writelocked */ -static void add_nommu_vma(struct vm_area_struct *vma) +static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma; + struct vm_area_struct *pvma, **pp; struct address_space *mapping; - struct rb_node **p = &nommu_vma_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p, *parent; + + kenter(",%p", vma); + + BUG_ON(!vma->vm_region); + + mm->map_count++; + vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { @@ -533,42 +603,62 @@ static void add_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* add the VMA to the master list */ + /* add the VMA to the tree */ + parent = NULL; + p = &mm->mm_rb.rb_node; while (*p) { parent = *p; pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - if (vma->vm_start < pvma->vm_start) { + /* sort by: start addr, end addr, VMA struct addr in that order + * (the latter is necessary as we may get identical VMAs) */ + if (vma->vm_start < pvma->vm_start) p = &(*p)->rb_left; - } - else if (vma->vm_start > pvma->vm_start) { + else if (vma->vm_start > pvma->vm_start) p = &(*p)->rb_right; - } - else { - /* mappings are at the same address - this can only - * happen for shared-mem chardevs and shared file - * mappings backed by ramfs/tmpfs */ - BUG_ON(!(pvma->vm_flags & VM_SHARED)); - - if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) - p = &(*p)->rb_right; - else - BUG(); - } + else if (vma->vm_end < pvma->vm_end) + p = &(*p)->rb_left; + else if (vma->vm_end > pvma->vm_end) + p = &(*p)->rb_right; + else if (vma < pvma) + p = &(*p)->rb_left; + else if (vma > pvma) + p = &(*p)->rb_right; + else + BUG(); } rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &nommu_vma_tree); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); + + /* add VMA to the VMA list also */ + for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { + if (pvma->vm_start > vma->vm_start) + break; + if (pvma->vm_start < vma->vm_start) + continue; + if (pvma->vm_end < vma->vm_end) + break; + } + + vma->vm_next = *pp; + *pp = vma; } /* - * delete a VMA from the global list + * delete a VMA from its owning mm_struct and address space */ -static void delete_nommu_vma(struct vm_area_struct *vma) +static void delete_vma_from_mm(struct vm_area_struct *vma) { + struct vm_area_struct **pp; struct address_space *mapping; + struct mm_struct *mm = vma->vm_mm; + + kenter("%p", vma); + + mm->map_count--; + if (mm->mmap_cache == vma) + mm->mmap_cache = NULL; /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -579,8 +669,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* remove from the master list */ - rb_erase(&vma->vm_rb, &nommu_vma_tree); + /* remove from the MM's tree and list */ + rb_erase(&vma->vm_rb, &mm->mm_rb); + for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { + if (*pp == vma) { + *pp = vma->vm_next; + break; + } + } + + vma->vm_mm = NULL; +} + +/* + * destroy a VMA record + */ +static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + kenter("%p", vma); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } + put_nommu_region(vma->vm_region); + kmem_cache_free(vm_area_cachep, vma); +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start <= addr && vma->vm_end > addr) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end > addr) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + unsigned long end = addr + len; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start == addr && vma->vm_end == end) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start < addr) + continue; + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end == end) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; } /* @@ -595,7 +792,7 @@ static int validate_mmap_request(struct file *file, unsigned long pgoff, unsigned long *_capabilities) { - unsigned long capabilities; + unsigned long capabilities, rlen; unsigned long reqprot = prot; int ret; @@ -615,12 +812,12 @@ static int validate_mmap_request(struct file *file, return -EINVAL; /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) + rlen = PAGE_ALIGN(len); + if (!rlen || rlen > TASK_SIZE) return -ENOMEM; /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; if (file) { @@ -794,9 +991,10 @@ static unsigned long determine_vm_flags(struct file *file, } /* - * set up a shared mapping on a file + * set up a shared mapping on a file (the driver or filesystem provides and + * pins the storage) */ -static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; @@ -814,10 +1012,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) /* * set up a private mapping or an anonymous shared mapping */ -static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_private(struct vm_area_struct *vma, + struct vm_region *region, + unsigned long len) { + struct page *pages; + unsigned long total, point, n, rlen; void *base; - int ret; + int ret, order; /* invoke the file's mapping function so that it can keep track of * shared mappings on devices or memory @@ -836,23 +1038,46 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) * make a private copy of the data and map that instead */ } + rlen = PAGE_ALIGN(len); + /* allocate some memory to hold the mapping * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL|__GFP_COMP); - if (!base) + order = get_order(rlen); + kdebug("alloc order %d for %lx", order, len); + + pages = alloc_pages(GFP_KERNEL, order); + if (!pages) goto enomem; - vma->vm_start = (unsigned long) base; - vma->vm_end = vma->vm_start + len; - vma->vm_flags |= VM_MAPPED_COPY; + /* we allocated a power-of-2 sized page set, so we need to trim off the + * excess */ + total = 1 << order; + atomic_add(total, &mmap_pages_allocated); + + point = rlen >> PAGE_SHIFT; + while (total > point) { + order = ilog2(total - point); + n = 1 << order; + kdebug("shave %lu/%lu @%lu", n, total - point, total); + atomic_sub(n, &mmap_pages_allocated); + total -= n; + set_page_refcounted(pages + total); + __free_pages(pages + total, order); + } + + total = rlen >> PAGE_SHIFT; + for (point = 1; point < total; point++) + set_page_refcounted(&pages[point]); -#ifdef WARN_ON_SLACK - if (len + WARN_ON_SLACK <= kobjsize(result)) - printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", - len, current->pid, kobjsize(result) - len); -#endif + base = page_address(pages); + region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + region->vm_start = (unsigned long) base; + region->vm_end = region->vm_start + rlen; + + vma->vm_start = region->vm_start; + vma->vm_end = region->vm_start + len; if (vma->vm_file) { /* read the contents of a file into the copy */ @@ -864,26 +1089,27 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) old_fs = get_fs(); set_fs(KERNEL_DS); - ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); + ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); set_fs(old_fs); if (ret < 0) goto error_free; /* clear the last little bit */ - if (ret < len) - memset(base + ret, 0, len - ret); + if (ret < rlen) + memset(base + ret, 0, rlen - ret); } else { /* if it's an anonymous mapping, then just clear it */ - memset(base, 0, len); + memset(base, 0, rlen); } return 0; error_free: - kfree(base); - vma->vm_start = 0; + free_page_series(region->vm_start, region->vm_end); + region->vm_start = vma->vm_start = 0; + region->vm_end = vma->vm_end = 0; return ret; enomem: @@ -903,13 +1129,14 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long flags, unsigned long pgoff) { - struct vm_list_struct *vml = NULL; - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma; + struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, vm_flags; - void *result; + unsigned long capabilities, vm_flags, result; int ret; + kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -917,73 +1144,120 @@ unsigned long do_mmap_pgoff(struct file *file, * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, &capabilities); - if (ret < 0) + if (ret < 0) { + kleave(" = %d [val]", ret); return ret; + } /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); - /* we're going to need to record the mapping if it works */ - vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); - if (!vml) - goto error_getting_vml; + /* we're going to need to record the mapping */ + region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); + if (!region) + goto error_getting_region; + + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + goto error_getting_vma; + + atomic_set(®ion->vm_usage, 1); + region->vm_flags = vm_flags; + region->vm_pgoff = pgoff; - down_write(&nommu_vma_sem); + INIT_LIST_HEAD(&vma->anon_vma_node); + vma->vm_flags = vm_flags; + vma->vm_pgoff = pgoff; - /* if we want to share, we need to check for VMAs created by other + if (file) { + region->vm_file = file; + get_file(file); + vma->vm_file = file; + get_file(file); + if (vm_flags & VM_EXECUTABLE) { + added_exe_file_vma(current->mm); + vma->vm_mm = current->mm; + } + } + + down_write(&nommu_region_sem); + + /* if we want to share, we need to check for regions created by other * mmap() calls that overlap with our proposed mapping - * - we can only share with an exact match on most regular files + * - we can only share with a superset match on most regular files * - shared mappings on character devices and memory backed files are * permitted to overlap inexactly as far as we are concerned for in * these cases, sharing is handled in the driver or filesystem rather * than here */ if (vm_flags & VM_MAYSHARE) { - unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vmpglen; + struct vm_region *pregion; + unsigned long pglen, rpglen, pgend, rpgend, start; - /* suppress VMA sharing for shared regions */ - if (vm_flags & VM_SHARED && - capabilities & BDI_CAP_MAP_DIRECT) - goto dont_share_VMAs; + pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgend = pgoff + pglen; - for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { - vma = rb_entry(rb, struct vm_area_struct, vm_rb); + for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { + pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (!(pregion->vm_flags & VM_MAYSHARE)) continue; /* search for overlapping mappings on the same file */ - if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) + if (pregion->vm_file->f_path.dentry->d_inode != + file->f_path.dentry->d_inode) continue; - if (vma->vm_pgoff >= pgoff + pglen) + if (pregion->vm_pgoff >= pgend) continue; - vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; - vmpglen >>= PAGE_SHIFT; - if (pgoff >= vma->vm_pgoff + vmpglen) + rpglen = pregion->vm_end - pregion->vm_start; + rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; + rpgend = pregion->vm_pgoff + rpglen; + if (pgoff >= rpgend) continue; - /* handle inexactly overlapping matches between mappings */ - if (vma->vm_pgoff != pgoff || vmpglen != pglen) { + /* handle inexactly overlapping matches between + * mappings */ + if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && + !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { + /* new mapping is not a subset of the region */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) goto sharing_violation; continue; } - /* we've found a VMA we can share */ - atomic_inc(&vma->vm_usage); - - vml->vma = vma; - result = (void *) vma->vm_start; - goto shared; + /* we've found a region we can share */ + atomic_inc(&pregion->vm_usage); + vma->vm_region = pregion; + start = pregion->vm_start; + start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; + vma->vm_start = start; + vma->vm_end = start + len; + + if (pregion->vm_flags & VM_MAPPED_COPY) { + kdebug("share copy"); + vma->vm_flags |= VM_MAPPED_COPY; + } else { + kdebug("share mmap"); + ret = do_mmap_shared_file(vma); + if (ret < 0) { + vma->vm_region = NULL; + vma->vm_start = 0; + vma->vm_end = 0; + atomic_dec(&pregion->vm_usage); + pregion = NULL; + goto error_just_free; + } + } + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; + goto share; } - dont_share_VMAs: - vma = NULL; - /* obtain the address at which to make a shared mapping * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping @@ -994,102 +1268,93 @@ unsigned long do_mmap_pgoff(struct file *file, if (IS_ERR((void *) addr)) { ret = addr; if (ret != (unsigned long) -ENOSYS) - goto error; + goto error_just_free; /* the driver refused to tell us where to site * the mapping so we'll have to attempt to copy * it */ ret = (unsigned long) -ENODEV; if (!(capabilities & BDI_CAP_MAP_COPY)) - goto error; + goto error_just_free; capabilities &= ~BDI_CAP_MAP_DIRECT; + } else { + vma->vm_start = region->vm_start = addr; + vma->vm_end = region->vm_end = addr + len; } } } - /* we're going to need a VMA struct as well */ - vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); - if (!vma) - goto error_getting_vma; - - INIT_LIST_HEAD(&vma->anon_vma_node); - atomic_set(&vma->vm_usage, 1); - if (file) { - get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } - } - vma->vm_file = file; - vma->vm_flags = vm_flags; - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; - - vml->vma = vma; + vma->vm_region = region; /* set up the mapping */ if (file && vma->vm_flags & VM_SHARED) - ret = do_mmap_shared_file(vma, len); + ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, len); + ret = do_mmap_private(vma, region, len); if (ret < 0) - goto error; + goto error_put_region; + + add_nommu_region(region); /* okay... we have a mapping; now we have to register it */ - result = (void *) vma->vm_start; + result = vma->vm_start; current->mm->total_vm += len >> PAGE_SHIFT; - add_nommu_vma(vma); +share: + add_vma_to_mm(current->mm, vma); - shared: - add_vma_to_mm(current->mm, vml); - - up_write(&nommu_vma_sem); + up_write(&nommu_region_sem); if (prot & PROT_EXEC) - flush_icache_range((unsigned long) result, - (unsigned long) result + len); + flush_icache_range(result, result + len); -#ifdef DEBUG - printk("do_mmap:\n"); - show_process_blocks(); -#endif + kleave(" = %lx", result); + return result; - return (unsigned long) result; - - error: - up_write(&nommu_vma_sem); - kfree(vml); +error_put_region: + __put_nommu_region(region); if (vma) { if (vma->vm_file) { fput(vma->vm_file); if (vma->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(vma->vm_mm); } - kfree(vma); + kmem_cache_free(vm_area_cachep, vma); } + kleave(" = %d [pr]", ret); return ret; - sharing_violation: - up_write(&nommu_vma_sem); - printk("Attempt to share mismatched mappings\n"); - kfree(vml); - return -EINVAL; +error_just_free: + up_write(&nommu_region_sem); +error: + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + kmem_cache_free(vm_area_cachep, vma); + kleave(" = %d", ret); + return ret; + +sharing_violation: + up_write(&nommu_region_sem); + printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + ret = -EINVAL; + goto error; - error_getting_vma: - up_write(&nommu_vma_sem); - kfree(vml); - printk("Allocation of vma for %lu byte allocation from process %d failed\n", +error_getting_vma: + kmem_cache_free(vm_region_jar, region); + printk(KERN_WARNING "Allocation of vma for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; - error_getting_vml: - printk("Allocation of vml for %lu byte allocation from process %d failed\n", +error_getting_region: + printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; @@ -1097,77 +1362,180 @@ unsigned long do_mmap_pgoff(struct file *file, EXPORT_SYMBOL(do_mmap_pgoff); /* - * handle mapping disposal for uClinux + * split a vma into two pieces at address 'addr', a new vma is allocated either + * for the first part or the tail. */ -static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { - if (vma) { - down_write(&nommu_vma_sem); + struct vm_area_struct *new; + struct vm_region *region; + unsigned long npages; - if (atomic_dec_and_test(&vma->vm_usage)) { - delete_nommu_vma(vma); + kenter(""); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + /* we're only permitted to split anonymous regions that have a single + * owner */ + if (vma->vm_file || + atomic_read(&vma->vm_region->vm_usage) != 1) + return -ENOMEM; - /* IO memory and memory shared directly out of the pagecache from - * ramfs/tmpfs mustn't be released here */ - if (vma->vm_flags & VM_MAPPED_COPY) - kfree((void *) vma->vm_start); + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } - kfree(vma); - } + region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); + if (!region) + return -ENOMEM; + + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) { + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; + } + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + *region = *vma->vm_region; + new->vm_region = region; + + npages = (addr - vma->vm_start) >> PAGE_SHIFT; + + if (new_below) { + region->vm_end = new->vm_end = addr; + } else { + region->vm_start = new->vm_start = addr; + region->vm_pgoff = new->vm_pgoff += npages; + } - up_write(&nommu_vma_sem); + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + delete_vma_from_mm(vma); + down_write(&nommu_region_sem); + delete_nommu_region(vma->vm_region); + if (new_below) { + vma->vm_region->vm_start = vma->vm_start = addr; + vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; + } else { + vma->vm_region->vm_end = vma->vm_end = addr; } + add_nommu_region(vma->vm_region); + add_nommu_region(new->vm_region); + up_write(&nommu_region_sem); + add_vma_to_mm(mm, vma); + add_vma_to_mm(mm, new); + return 0; } /* - * release a mapping - * - under NOMMU conditions the parameters must match exactly to the mapping to - * be removed + * shrink a VMA by removing the specified chunk from either the beginning or + * the end */ -int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +static int shrink_vma(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long from, unsigned long to) { - struct vm_list_struct *vml, **parent; - unsigned long end = addr + len; + struct vm_region *region; -#ifdef DEBUG - printk("do_munmap:\n"); -#endif + kenter(""); - for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { - if ((*parent)->vma->vm_start > addr) - break; - if ((*parent)->vma->vm_start == addr && - ((len == 0) || ((*parent)->vma->vm_end == end))) - goto found; - } + /* adjust the VMA's pointers, which may reposition it in the MM's tree + * and list */ + delete_vma_from_mm(vma); + if (from > vma->vm_start) + vma->vm_end = from; + else + vma->vm_start = to; + add_vma_to_mm(mm, vma); - printk("munmap of non-mmaped memory by process %d (%s): %p\n", - current->pid, current->comm, (void *) addr); - return -EINVAL; + /* cut the backing region down to size */ + region = vma->vm_region; + BUG_ON(atomic_read(®ion->vm_usage) != 1); - found: - vml = *parent; + down_write(&nommu_region_sem); + delete_nommu_region(region); + if (from > region->vm_start) + region->vm_end = from; + else + region->vm_start = to; + add_nommu_region(region); + up_write(&nommu_region_sem); - put_vma(mm, vml->vma); + free_page_series(from, to); + return 0; +} - *parent = vml->next; - kfree(vml); +/* + * release a mapping + * - under NOMMU conditions the chunk to be unmapped must be backed by a single + * VMA, though it need not cover the whole VMA + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + struct rb_node *rb; + unsigned long end = start + len; + int ret; - update_hiwater_vm(mm); - mm->total_vm -= len >> PAGE_SHIFT; + kenter(",%lx,%zx", start, len); -#ifdef DEBUG - show_process_blocks(); -#endif + if (len == 0) + return -EINVAL; + + /* find the first potentially overlapping VMA */ + vma = find_vma(mm, start); + if (!vma) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d (%s):" + " 0x%lx-0x%lx\n", + current->pid, current->comm, start, start + len - 1); + return -EINVAL; + } + /* we're allowed to split an anonymous VMA but not a file-backed one */ + if (vma->vm_file) { + do { + if (start > vma->vm_start) { + kleave(" = -EINVAL [miss]"); + return -EINVAL; + } + if (end == vma->vm_end) + goto erase_whole_vma; + rb = rb_next(&vma->vm_rb); + vma = rb_entry(rb, struct vm_area_struct, vm_rb); + } while (rb); + kleave(" = -EINVAL [split file]"); + return -EINVAL; + } else { + /* the chunk must be a subset of the VMA found */ + if (start == vma->vm_start && end == vma->vm_end) + goto erase_whole_vma; + if (start < vma->vm_start || end > vma->vm_end) { + kleave(" = -EINVAL [superset]"); + return -EINVAL; + } + if (start & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned start]"); + return -EINVAL; + } + if (end != vma->vm_end && end & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned split]"); + return -EINVAL; + } + if (start != vma->vm_start && end != vma->vm_end) { + ret = split_vma(mm, vma, start, 1); + if (ret < 0) { + kleave(" = %d [split]", ret); + return ret; + } + } + return shrink_vma(mm, vma, start, end); + } + +erase_whole_vma: + delete_vma_from_mm(vma); + delete_vma(mm, vma); + kleave(" = 0"); return 0; } EXPORT_SYMBOL(do_munmap); @@ -1184,29 +1552,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len) } /* - * Release all mappings + * release all the mappings made in a process's VM space */ -void exit_mmap(struct mm_struct * mm) +void exit_mmap(struct mm_struct *mm) { - struct vm_list_struct *tmp; + struct vm_area_struct *vma; - if (mm) { -#ifdef DEBUG - printk("Exit_mmap:\n"); -#endif + if (!mm) + return; - mm->total_vm = 0; + kenter(""); - while ((tmp = mm->context.vmlist)) { - mm->context.vmlist = tmp->next; - put_vma(mm, tmp->vma); - kfree(tmp); - } + mm->total_vm = 0; -#ifdef DEBUG - show_process_blocks(); -#endif + while ((vma = mm->mmap)) { + mm->mmap = vma->vm_next; + delete_vma_from_mm(vma); + delete_vma(mm, vma); } + + kleave(""); } unsigned long do_brk(unsigned long addr, unsigned long len) @@ -1219,8 +1584,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) * time (controlled by the MREMAP_MAYMOVE flag and available VM space) * * under NOMMU conditions, we only permit changing a mapping's size, and only - * as long as it stays within the hole allocated by the kmalloc() call in - * do_mmap_pgoff() and the block is not shareable + * as long as it stays within the region allocated by do_mmap_private() and the + * block is not shareable * * MREMAP_FIXED is not supported under NOMMU conditions */ @@ -1231,13 +1596,16 @@ unsigned long do_mremap(unsigned long addr, struct vm_area_struct *vma; /* insanity checks first */ - if (new_len == 0) + if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; + if (addr & ~PAGE_MASK) + return -EINVAL; + if (flags & MREMAP_FIXED && new_addr != addr) return (unsigned long) -EINVAL; - vma = find_vma_exact(current->mm, addr); + vma = find_vma_exact(current->mm, addr, old_len); if (!vma) return (unsigned long) -EINVAL; @@ -1247,19 +1615,19 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_MAYSHARE) return (unsigned long) -EPERM; - if (new_len > kobjsize((void *) addr)) + if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) return (unsigned long) -ENOMEM; /* all checks complete - do it */ vma->vm_end = vma->vm_start + new_len; - return vma->vm_start; } EXPORT_SYMBOL(do_mremap); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +asmlinkage +unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) { unsigned long ret; -- cgit v1.2.3 From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:54 +0100 Subject: [CVE-2009-0029] Convert all system calls to return a long Convert all system calls to return a long. This should be a NOP since all converted types should have the same size anyway. With the exception of sys_exit_group which returned void. But that doesn't matter since the system call doesn't return. Signed-off-by: Heiko Carstens --- fs/read_write.c | 18 +++++------ fs/xattr.c | 12 ++++---- include/linux/syscalls.h | 79 ++++++++++++++++++++++++------------------------ ipc/mqueue.c | 2 +- kernel/exit.c | 4 ++- kernel/signal.c | 2 +- kernel/timer.c | 2 +- mm/filemap.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/nommu.c | 2 +- 11 files changed, 64 insertions(+), 63 deletions(-) (limited to 'mm/mmap.c') diff --git a/fs/read_write.c b/fs/read_write.c index 5cc6924eb158..940367f51f2a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(vfs_llseek); -asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin) +asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin) { off_t retval; struct file * file; @@ -369,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) file->f_pos = pos; } -asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) +asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count) { struct file *file; ssize_t ret = -EBADF; @@ -386,7 +386,7 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) return ret; } -asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count) +asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count) { struct file *file; ssize_t ret = -EBADF; @@ -403,7 +403,7 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co return ret; } -asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, +asmlinkage long sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct file *file; @@ -424,7 +424,7 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, return ret; } -asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, +asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct file *file; @@ -672,7 +672,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_writev); -asmlinkage ssize_t +asmlinkage long sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { struct file *file; @@ -693,7 +693,7 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) return ret; } -asmlinkage ssize_t +asmlinkage long sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { struct file *file; @@ -812,7 +812,7 @@ out: return retval; } -asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) +asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) { loff_t pos; off_t off; @@ -831,7 +831,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, siz return do_sendfile(out_fd, in_fd, NULL, count, 0); } -asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) +asmlinkage long sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) { loff_t pos; ssize_t ret; diff --git a/fs/xattr.c b/fs/xattr.c index 237804cd6b56..d049ae27aae7 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -349,7 +349,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, return error; } -asmlinkage ssize_t +asmlinkage long sys_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { @@ -364,7 +364,7 @@ sys_getxattr(const char __user *pathname, const char __user *name, return error; } -asmlinkage ssize_t +asmlinkage long sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { @@ -379,7 +379,7 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user return error; } -asmlinkage ssize_t +asmlinkage long sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size) { struct file *f; @@ -424,7 +424,7 @@ listxattr(struct dentry *d, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_listxattr(const char __user *pathname, char __user *list, size_t size) { struct path path; @@ -438,7 +438,7 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_llistxattr(const char __user *pathname, char __user *list, size_t size) { struct path path; @@ -452,7 +452,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size) { struct file *f; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a7593f670ca6..22290eeaf553 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,7 +77,7 @@ asmlinkage long sys_times(struct tms __user *tbuf); asmlinkage long sys_gettid(void); asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp); -asmlinkage unsigned long sys_alarm(unsigned int seconds); +asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpid(void); asmlinkage long sys_getppid(void); asmlinkage long sys_getuid(void); @@ -166,7 +166,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, unsigned long flags); asmlinkage long sys_exit(int error_code); -asmlinkage void sys_exit_group(int error_code); +asmlinkage long sys_exit_group(int error_code); asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, int options, struct rusage __user *ru); asmlinkage long sys_waitid(int which, pid_t pid, @@ -196,7 +196,7 @@ asmlinkage long sys_tkill(int pid, int sig); asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); asmlinkage long sys_sgetmask(void); asmlinkage long sys_ssetmask(int newmask); -asmlinkage unsigned long sys_signal(int sig, __sighandler_t handler); +asmlinkage long sys_signal(int sig, __sighandler_t handler); asmlinkage long sys_pause(void); asmlinkage long sys_sync(void); @@ -246,29 +246,29 @@ asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name, const void __user *value, size_t size, int flags); asmlinkage long sys_fsetxattr(int fd, const char __user *name, const void __user *value, size_t size, int flags); -asmlinkage ssize_t sys_getxattr(const char __user *path, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_lgetxattr(const char __user *path, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_fgetxattr(int fd, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_listxattr(const char __user *path, char __user *list, - size_t size); -asmlinkage ssize_t sys_llistxattr(const char __user *path, char __user *list, - size_t size); -asmlinkage ssize_t sys_flistxattr(int fd, char __user *list, size_t size); +asmlinkage long sys_getxattr(const char __user *path, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_fgetxattr(int fd, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_listxattr(const char __user *path, char __user *list, + size_t size); +asmlinkage long sys_llistxattr(const char __user *path, char __user *list, + size_t size); +asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size); asmlinkage long sys_removexattr(const char __user *path, const char __user *name); asmlinkage long sys_lremovexattr(const char __user *path, const char __user *name); asmlinkage long sys_fremovexattr(int fd, const char __user *name); -asmlinkage unsigned long sys_brk(unsigned long brk); +asmlinkage long sys_brk(unsigned long brk); asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr); +asmlinkage long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); @@ -321,10 +321,10 @@ asmlinkage long sys_io_submit(aio_context_t, long, struct iocb __user * __user *); asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, struct io_event __user *result); -asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, - off_t __user *offset, size_t count); -asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, - loff_t __user *offset, size_t count); +asmlinkage long sys_sendfile(int out_fd, int in_fd, + off_t __user *offset, size_t count); +asmlinkage long sys_sendfile64(int out_fd, int in_fd, + loff_t __user *offset, size_t count); asmlinkage long sys_readlink(const char __user *path, char __user *buf, int bufsiz); asmlinkage long sys_creat(const char __user *pathname, int mode); @@ -368,26 +368,25 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times); asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes); -asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, - unsigned int origin); +asmlinkage long sys_lseek(unsigned int fd, off_t offset, + unsigned int origin); asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, unsigned long offset_low, loff_t __user *result, unsigned int origin); -asmlinkage ssize_t sys_read(unsigned int fd, char __user *buf, - size_t count); -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count); -asmlinkage ssize_t sys_readv(unsigned long fd, - const struct iovec __user *vec, - unsigned long vlen); -asmlinkage ssize_t sys_write(unsigned int fd, const char __user *buf, - size_t count); -asmlinkage ssize_t sys_writev(unsigned long fd, - const struct iovec __user *vec, - unsigned long vlen); -asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, - size_t count, loff_t pos); -asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, - size_t count, loff_t pos); +asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count); +asmlinkage long sys_readv(unsigned long fd, + const struct iovec __user *vec, + unsigned long vlen); +asmlinkage long sys_write(unsigned int fd, const char __user *buf, + size_t count); +asmlinkage long sys_writev(unsigned long fd, + const struct iovec __user *vec, + unsigned long vlen); +asmlinkage long sys_pread64(unsigned int fd, char __user *buf, + size_t count, loff_t pos); +asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, + size_t count, loff_t pos); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); @@ -476,7 +475,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); asmlinkage long sys_mq_unlink(const char __user *name); asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); -asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 23fdb8492b8e..6df028b70543 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -907,7 +907,7 @@ out: return ret; } -asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout) { diff --git a/kernel/exit.c b/kernel/exit.c index c7740fa3252c..fac9b040af2c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1182,9 +1182,11 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ -asmlinkage void sys_exit_group(int error_code) +asmlinkage long sys_exit_group(int error_code) { do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; } static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) diff --git a/kernel/signal.c b/kernel/signal.c index 3152ac3b62e2..856a5479d49d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2559,7 +2559,7 @@ sys_ssetmask(int newmask) /* * For backwards compatibility. Functionality superseded by sigaction. */ -asmlinkage unsigned long +asmlinkage long sys_signal(int sig, __sighandler_t handler) { struct k_sigaction new_sa, old_sa; diff --git a/kernel/timer.c b/kernel/timer.c index dee3f641a7a7..7b8697d7f04d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks) * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ -asmlinkage unsigned long sys_alarm(unsigned int seconds) +asmlinkage long sys_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } diff --git a/mm/filemap.c b/mm/filemap.c index ceba0bd03662..538b75ed6236 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; diff --git a/mm/mmap.c b/mm/mmap.c index 749623196cb9..a970d890cb21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; diff --git a/mm/mremap.c b/mm/mremap.c index 646de959aa58..5572e0825d80 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,7 +420,7 @@ out_nc: return ret; } -asmlinkage unsigned long sys_mremap(unsigned long addr, +asmlinkage long sys_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { diff --git a/mm/nommu.c b/mm/nommu.c index 60ed8375c986..ee3e78927739 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { struct mm_struct *mm = current->mm; -- cgit v1.2.3 From 6a6160a7b5c27b3c38651baef92a14fa7072b3c1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:15 +0100 Subject: [CVE-2009-0029] System call wrappers part 13 Signed-off-by: Heiko Carstens --- fs/xattr.c | 7 +++---- mm/fremap.c | 4 ++-- mm/mlock.c | 4 ++-- mm/mmap.c | 4 ++-- mm/mprotect.c | 4 ++-- mm/mremap.c | 6 +++--- mm/msync.c | 2 +- mm/nommu.c | 11 +++++------ 8 files changed, 20 insertions(+), 22 deletions(-) (limited to 'mm/mmap.c') diff --git a/fs/xattr.c b/fs/xattr.c index 0367a5dae2b8..197c4fcac032 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -499,8 +499,8 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname, return error; } -asmlinkage long -sys_lremovexattr(const char __user *pathname, const char __user *name) +SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, + const char __user *, name) { struct path path; int error; @@ -517,8 +517,7 @@ sys_lremovexattr(const char __user *pathname, const char __user *name) return error; } -asmlinkage long -sys_fremovexattr(int fd, const char __user *name) +SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { struct file *f; struct dentry *dentry; diff --git a/mm/fremap.c b/mm/fremap.c index 62d5bbda921a..736ba7f3306a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, * and the vma's default protection is used. Arbitrary protections * might be implemented in the future. */ -asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, - unsigned long prot, unsigned long pgoff, unsigned long flags) +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct address_space *mapping; diff --git a/mm/mlock.c b/mm/mlock.c index e125156c664e..04d5e7429c55 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -530,7 +530,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -asmlinkage long sys_mlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; unsigned long lock_limit; @@ -558,7 +558,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) return error; } -asmlinkage long sys_munlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; diff --git a/mm/mmap.c b/mm/mmap.c index a970d890cb21..8d95902e9a38 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; @@ -1948,7 +1948,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; diff --git a/mm/mprotect.c b/mm/mprotect.c index d0f6e7ce09f1..abe2694e13f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -217,8 +217,8 @@ fail: return error; } -asmlinkage long -sys_mprotect(unsigned long start, size_t len, unsigned long prot) +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) { unsigned long vm_flags, nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; diff --git a/mm/mremap.c b/mm/mremap.c index 5572e0825d80..a39b7b91be46 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,9 +420,9 @@ out_nc: return ret; } -asmlinkage long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; diff --git a/mm/msync.c b/mm/msync.c index 07dae08cf31c..4083209b7f02 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -28,7 +28,7 @@ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ -asmlinkage long sys_msync(unsigned long start, size_t len, int flags) +SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) { unsigned long end; struct mm_struct *mm = current->mm; diff --git a/mm/nommu.c b/mm/nommu.c index ee3e78927739..8cee8c8ff0f2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { struct mm_struct *mm = current->mm; @@ -1573,7 +1573,7 @@ erase_whole_vma: } EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; @@ -1657,10 +1657,9 @@ unsigned long do_mremap(unsigned long addr, } EXPORT_SYMBOL(do_mremap); -asmlinkage -unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; -- cgit v1.2.3 From de33c8db5910cda599899dd431cc30d7c1018cbf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Jan 2009 17:46:42 -0800 Subject: Fix OOPS in mmap_region() when merging adjacent VM_LOCKED file segments As of commit ba470de43188cdbff795b5da43a1474523c6c2fb ("map: handle mlocked pages during map, remap, unmap") we now use the 'vma' variable at the end of mmap_region() to handle the page-in of newly mapped mlocked pages. However, if we merged adjacent vma's together, the vma we're using may be stale. We historically consciously avoided using it after the merge operation, but that got overlooked when redoing the locked page handling. This commit simplifies mmap_region() by doing any vma merges early, avoiding the issue entirely, and 'vma' will always be valid. As pointed out by Hugh Dickins, this depends on any drivers that change the page offset of flags to have set one of the VM_SPECIAL bits (so that they cannot trigger the early merge logic), but that's true in general. Reported-and-tested-by: Maksim Yevmenkin Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Andrew Morton Cc: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mmap.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 8d95902e9a38..d3fa10a726cf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1134,16 +1134,11 @@ munmap_back: } /* - * Can we just expand an old private anonymous mapping? - * The VM_SHARED test is necessary because shmem_zero_setup - * will create the file object for a shared anonymous map below. + * Can we just expand an old mapping? */ - if (!file && !(vm_flags & VM_SHARED)) { - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL); - if (vma) - goto out; - } + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); + if (vma) + goto out; /* * Determine the object being mapped and call the appropriate @@ -1206,17 +1201,8 @@ munmap_back: if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (file && vma_merge(mm, prev, addr, vma->vm_end, - vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); - fput(file); - if (vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } else { - vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - } + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; /* Once vma denies write, undo our temporary denial count */ if (correct_wcount) -- cgit v1.2.3 From 33bfad54b58cf05cfe6678c3ec9235d4bc8db4c2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 30 Jan 2009 11:37:22 -0800 Subject: Allow opportunistic merging of VM_CAN_NONLINEAR areas Commit de33c8db5910cda599899dd431cc30d7c1018cbf ("Fix OOPS in mmap_region() when merging adjacent VM_LOCKED file segments") unified the vma merging of anonymous and file maps to just one place, which simplified the code and fixed a use-after-free bug that could cause an oops. But by doing the merge opportunistically before even having called ->mmap() on the file method, it now compares two different 'vm_flags' values: the pre-mmap() value of the new not-yet-formed vma, and previous mappings of the same file around it. And in doing so, it refused to merge the common file case, which adds a marker to say "I can be made non-linear". This fixes it by just adding a set of flags that don't have to match, because we know they are ok to merge. Currently it's only that single VM_CAN_NONLINEAR flag, but at least conceptually there could be others in the future. Reported-and-acked-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Andrew Morton Cc: Greg KH Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index d3fa10a726cf..c581df14d0de 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -658,6 +658,9 @@ again: remove_next = 1 + (end > next->vm_end); validate_mm(mm); } +/* Flags that can be inherited from an existing mapping when merging */ +#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -665,7 +668,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if (vma->vm_flags != vm_flags) + if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) return 0; if (vma->vm_file != file) return 0; -- cgit v1.2.3 From fc8744adc870a8d4366908221508bb113d8b72ee Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 31 Jan 2009 15:08:56 -0800 Subject: Stop playing silly games with the VM_ACCOUNT flag The mmap_region() code would temporarily set the VM_ACCOUNT flag for anonymous shared mappings just to inform shmem_zero_setup() that it should enable accounting for the resulting shm object. It would then clear the flag after calling ->mmap (for the /dev/zero case) or doing shmem_zero_setup() (for the MAP_ANON case). This just resulted in vma merge issues, but also made for just unnecessary confusion. Use the already-existing VM_NORESERVE flag for this instead, and let shmem_{zero|file}_setup() just figure it out from that. This also happens to make it obvious that the new DRI2 GEM layer uses a non-reserving backing store for its object allocation - which is quite possibly not intentional. But since I didn't want to change semantics in this patch, I left it alone, and just updated the caller to use the new flag semantics. Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_gem.c | 2 +- ipc/shm.c | 4 ++-- mm/mmap.c | 48 ++++++++++++++++++++++++----------------------- mm/shmem.c | 2 +- 4 files changed, 29 insertions(+), 27 deletions(-) (limited to 'mm/mmap.c') diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 9da581452874..6915fb82d0b0 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -136,7 +136,7 @@ drm_gem_object_alloc(struct drm_device *dev, size_t size) obj = kcalloc(1, sizeof(*obj), GFP_KERNEL); obj->dev = dev; - obj->filp = shmem_file_setup("drm mm object", size, 0); + obj->filp = shmem_file_setup("drm mm object", size, VM_NORESERVE); if (IS_ERR(obj->filp)) { kfree(obj); return NULL; diff --git a/ipc/shm.c b/ipc/shm.c index a9e09ad2263e..c0a021f7f41a 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -368,14 +368,14 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) file = hugetlb_file_setup(name, size); shp->mlock_user = current_user(); } else { - int acctflag = VM_ACCOUNT; + int acctflag = 0; /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. */ if ((shmflg & SHM_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - acctflag = 0; + acctflag = VM_NORESERVE; file = shmem_file_setup(name, size, acctflag); } error = PTR_ERR(file); diff --git a/mm/mmap.c b/mm/mmap.c index c581df14d0de..214b6a258eeb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1090,6 +1090,15 @@ int vma_wants_writenotify(struct vm_area_struct *vma) mapping_cap_account_dirty(vma->vm_file->f_mapping); } +/* + * We account for memory if it's a private writeable mapping, + * and VM_NORESERVE wasn't set. + */ +static inline int accountable_mapping(unsigned int vm_flags) +{ + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; +} + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, unsigned int vm_flags, unsigned long pgoff, @@ -1117,23 +1126,24 @@ munmap_back: if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - if (flags & MAP_NORESERVE) + /* + * Set 'VM_NORESERVE' if we should not account for the + * memory use of this mapping. We only honor MAP_NORESERVE + * if we're allowed to overcommit memory. + */ + if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + if (!accountable) vm_flags |= VM_NORESERVE; - if (accountable && (!(flags & MAP_NORESERVE) || - sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { - if (vm_flags & VM_SHARED) { - /* Check memory availability in shmem_file_setup? */ - vm_flags |= VM_ACCOUNT; - } else if (vm_flags & VM_WRITE) { - /* - * Private writable mapping: check memory availability - */ - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; } /* @@ -1184,14 +1194,6 @@ munmap_back: goto free_vma; } - /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform - * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) - * that memory reservation must be checked; but that reservation - * belongs to shared memory object, not to vma: so now clear it. - */ - if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) - vma->vm_flags &= ~VM_ACCOUNT; - /* Can addr have changed?? * * Answer: Yes, several device drivers can do it in their diff --git a/mm/shmem.c b/mm/shmem.c index 5d0de96c9789..19d566ccdeea 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2628,7 +2628,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) goto close_file; #ifdef CONFIG_SHMEM - SHMEM_I(inode)->flags = flags & VM_ACCOUNT; + SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT; #endif d_instantiate(dentry, inode); inode->i_size = size; -- cgit v1.2.3