summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig21
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem.c1
-rw-r--r--mm/filemap.c24
-rw-r--r--mm/filemap_xip.c22
-rw-r--r--mm/fremap.c86
-rw-r--r--mm/highmem.c14
-rw-r--r--mm/hugetlb.c207
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memory.c993
-rw-r--r--mm/memory_hotplug.c138
-rw-r--r--mm/mempolicy.c463
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/mmap.c128
-rw-r--r--mm/mprotect.c19
-rw-r--r--mm/mremap.c193
-rw-r--r--mm/msync.c78
-rw-r--r--mm/nommu.c18
-rw-r--r--mm/page_alloc.c240
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/pdflush.c13
-rw-r--r--mm/rmap.c146
-rw-r--r--mm/shmem.c32
-rw-r--r--mm/slab.c13
-rw-r--r--mm/sparse.c99
-rw-r--r--mm/swap.c7
-rw-r--r--mm/swap_state.c11
-rw-r--r--mm/swapfile.c41
-rw-r--r--mm/thrash.c2
-rw-r--r--mm/tiny-shmem.c5
-rw-r--r--mm/truncate.c11
-rw-r--r--mm/vmalloc.c77
-rw-r--r--mm/vmscan.c14
33 files changed, 1746 insertions, 1382 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d136..1a4473fcb2ca 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -111,3 +111,24 @@ config SPARSEMEM_STATIC
config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+ bool "Allow for memory hot-add"
+ depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+
+comment "Memory hotplug is currently incompatible with Software Suspend"
+ depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "4096" if ARM && !CPU_CACHE_VIPT
+ default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+ default "4"
diff --git a/mm/Makefile b/mm/Makefile
index 4cd69e3ce421..2fa6d2ca9f28 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
-
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a58699b6579e..e8c567177dcf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
if (j + 16 < BITS_PER_LONG)
prefetchw(page + j + 16);
__ClearPageReserved(page + j);
+ set_page_count(page + j, 0);
}
__free_pages(page, order);
i += BITS_PER_LONG;
diff --git a/mm/filemap.c b/mm/filemap.c
index b5346576e58d..5d6e4c2000dc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
*
* ->mmap_sem
* ->i_mmap_lock
- * ->page_table_lock (various places, mainly in mmap.c)
+ * ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
- * ->page_table_lock (anon_vma_prepare and various)
+ * ->page_table_lock or pte_lock (anon_vma_prepare and various)
*
- * ->page_table_lock
+ * ->page_table_lock or pte_lock
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
@@ -152,7 +152,7 @@ static int sync_page(void *word)
* in the ->sync_page() methods make essential use of the
* page_mapping(), merely passing the page down to the backing
* device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page->private is
+ * ignore it for all cases but swap, where only page_private(page) is
* of interest. When page_mapping() does go NULL, the entire
* call stack gracefully ignores the page and returns.
* -- wli
@@ -377,7 +377,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
* This function does not add the page to the LRU. The caller must do that.
*/
int add_to_page_cache(struct page *page, struct address_space *mapping,
- pgoff_t offset, int gfp_mask)
+ pgoff_t offset, gfp_t gfp_mask)
{
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
@@ -401,7 +401,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
EXPORT_SYMBOL(add_to_page_cache);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
- pgoff_t offset, int gfp_mask)
+ pgoff_t offset, gfp_t gfp_mask)
{
int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0)
@@ -591,7 +591,7 @@ EXPORT_SYMBOL(find_lock_page);
* memory exhaustion.
*/
struct page *find_or_create_page(struct address_space *mapping,
- unsigned long index, unsigned int gfp_mask)
+ unsigned long index, gfp_t gfp_mask)
{
struct page *page, *cached_page = NULL;
int err;
@@ -683,7 +683,7 @@ struct page *
grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
struct page *page = find_get_page(mapping, index);
- unsigned int gfp_mask;
+ gfp_t gfp_mask;
if (page) {
if (!TestSetPageLocked(page))
@@ -1030,8 +1030,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
desc.error = 0;
do_generic_file_read(filp,ppos,&desc,file_read_actor);
retval += desc.written;
- if (!retval) {
- retval = desc.error;
+ if (desc.error) {
+ retval = retval ?: desc.error;
break;
}
}
@@ -1520,7 +1520,7 @@ repeat:
page_cache_release(page);
return err;
}
- } else {
+ } else if (vma->vm_flags & VM_NONLINEAR) {
/* No page was found just because we can't read it in now (being
* here implies nonblock != 0), but the page may exist, so set
* the PTE to fault it in later. */
@@ -1537,6 +1537,7 @@ repeat:
return 0;
}
+EXPORT_SYMBOL(filemap_populate);
struct vm_operations_struct generic_file_vm_ops = {
.nopage = filemap_nopage,
@@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
-EXPORT_SYMBOL(filemap_populate);
/*
* This is for filesystems which do not implement ->writepage.
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f537732..9cf687e4a29a 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,8 @@ __xip_unmap (struct address_space * mapping,
unsigned long address;
pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
+ struct page *page;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -181,19 +183,17 @@ __xip_unmap (struct address_space * mapping,
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- pte = page_check_address(ZERO_PAGE(address), mm,
- address);
- if (!IS_ERR(pte)) {
+ page = ZERO_PAGE(address);
+ pte = page_check_address(page, mm, address, &ptl);
+ if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
+ page_remove_rmap(page);
+ dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
+ page_cache_release(page);
}
}
spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area,
page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
if (!IS_ERR(page)) {
- return page;
+ goto out;
}
if (PTR_ERR(page) != -ENODATA)
return NULL;
@@ -249,6 +249,8 @@ xip_file_nopage(struct vm_area_struct * area,
page = ZERO_PAGE(address);
}
+out:
+ page_cache_get(page);
return page;
}
diff --git a/mm/fremap.c b/mm/fremap.c
index ab23a0673c35..d862be3bc3e3 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -20,33 +20,32 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
+ struct page *page = NULL;
- if (pte_none(pte))
- return;
if (pte_present(pte)) {
unsigned long pfn = pte_pfn(pte);
-
flush_cache_page(vma, addr, pfn);
pte = ptep_clear_flush(vma, addr, ptep);
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
- if (!PageReserved(page)) {
- if (pte_dirty(pte))
- set_page_dirty(page);
- page_remove_rmap(page);
- page_cache_release(page);
- dec_mm_counter(mm, rss);
- }
+ if (unlikely(!pfn_valid(pfn))) {
+ print_bad_pte(vma, pte, addr);
+ goto out;
}
+ page = pfn_to_page(pfn);
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+ page_remove_rmap(page);
+ page_cache_release(page);
} else {
if (!pte_file(pte))
free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear(mm, addr, ptep);
}
+out:
+ return !!page;
}
/*
@@ -64,21 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud;
pgd_t *pgd;
pte_t pte_val;
+ spinlock_t *ptl;
+
+ BUG_ON(vma->vm_flags & VM_RESERVED);
pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
- goto err_unlock;
-
+ goto out;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
- goto err_unlock;
-
- pte = pte_alloc_map(mm, pmd, addr);
+ goto out;
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- goto err_unlock;
+ goto out;
/*
* This page may have been truncated. Tell the
@@ -88,29 +86,27 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
inode = vma->vm_file->f_mapping->host;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (!page->mapping || page->index >= size)
- goto err_unlock;
+ goto unlock;
err = -ENOMEM;
if (page_mapcount(page) > INT_MAX/2)
- goto err_unlock;
+ goto unlock;
- zap_pte(mm, vma, addr, pte);
+ if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
+ inc_mm_counter(mm, file_rss);
- inc_mm_counter(mm,rss);
flush_icache_page(vma, page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
page_add_file_rmap(page);
pte_val = *pte;
- pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
-
err = 0;
-err_unlock:
- spin_unlock(&mm->page_table_lock);
+unlock:
+ pte_unmap_unlock(pte, ptl);
+out:
return err;
}
EXPORT_SYMBOL(install_page);
-
/*
* Install a file pte to a given virtual memory address, release any
* previously existing mapping.
@@ -124,37 +120,35 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud;
pgd_t *pgd;
pte_t pte_val;
+ spinlock_t *ptl;
+
+ BUG_ON(vma->vm_flags & VM_RESERVED);
pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
- goto err_unlock;
-
+ goto out;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
- goto err_unlock;
-
- pte = pte_alloc_map(mm, pmd, addr);
+ goto out;
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- goto err_unlock;
+ goto out;
- zap_pte(mm, vma, addr, pte);
+ if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, file_rss);
+ }
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
pte_val = *pte;
- pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
- spin_unlock(&mm->page_table_lock);
- return 0;
-
-err_unlock:
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
+ err = 0;
+out:
return err;
}
-
/***
* sys_remap_file_pages - remap arbitrary pages of a shared backing store
* file within an existing vma.
diff --git a/mm/highmem.c b/mm/highmem.c
index 90e1861e2da0..ce2e7e8bbfa7 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,11 +30,9 @@
static mempool_t *page_pool, *isa_page_pool;
-static void *page_pool_alloc(gfp_t gfp_mask, void *data)
+static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
{
- unsigned int gfp = gfp_mask | (unsigned int) (long) data;
-
- return alloc_page(gfp);
+ return alloc_page(gfp_mask | GFP_DMA);
}
static void page_pool_free(void *page, void *data)
@@ -51,6 +49,12 @@ static void page_pool_free(void *page, void *data)
* n means that there are (n-1) current users of it.
*/
#ifdef CONFIG_HIGHMEM
+
+static void *page_pool_alloc(gfp_t gfp_mask, void *data)
+{
+ return alloc_page(gfp_mask);
+}
+
static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -267,7 +271,7 @@ int init_emergency_isa_pool(void)
if (isa_page_pool)
return 0;
- isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
+ isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL);
if (!isa_page_pool)
BUG();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d380678030..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
unsigned long addr;
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ src_pte = huge_pte_offset(src, addr);
+ if (!src_pte)
+ continue;
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
+ spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
- src_pte = huge_pte_offset(src, addr);
- if (src_pte && !pte_none(*src_pte)) {
+ if (!pte_none(*src_pte)) {
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+ add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
+ spin_unlock(&dst->page_table_lock);
}
return 0;
@@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
+ spin_lock(&mm->page_table_lock);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
- if (! ptep)
- /* This can happen on truncate, or if an
- * mmap() is aborted due to an error before
- * the prefault */
+ if (!ptep)
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
page = pte_page(pte);
put_page(page);
- add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE));
+ add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
}
- flush_tlb_range(vma, start, end);
-}
-
-void zap_hugepage_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long length)
-{
- struct mm_struct *mm = vma->vm_mm;
- spin_lock(&mm->page_table_lock);
- unmap_hugepage_range(vma, start, start + length);
spin_unlock(&mm->page_table_lock);
+ flush_tlb_range(vma, start, end);
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
- BUG_ON(vma->vm_start & ~HPAGE_MASK);
- BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
- hugetlb_prefault_arch_hook(mm);
-
- spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
-
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
+ struct page *page;
+ int err;
+ struct inode *inode = mapping->host;
+ unsigned long size;
+
+retry:
+ page = find_lock_page(mapping, idx);
+ if (page)
+ goto out;
+
+ /* Check to make sure the mapping hasn't been truncated */
+ size = i_size_read(inode) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto out;
+
+ if (hugetlb_get_quota(mapping))
+ goto out;
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
- add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
}
out:
- spin_unlock(&mm->page_table_lock);
- return ret;
+ return page;
}
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully. Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
int ret = VM_FAULT_SIGBUS;
+ unsigned long idx;
+ unsigned long size;
pte_t *pte;
+ struct page *page;
+ struct address_space *mapping;
+
+ pte = huge_pte_alloc(mm, address);
+ if (!pte)
+ goto out;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+ /*
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
+ */
+ page = find_lock_huge_page(mapping, idx);
+ if (!page)
+ goto out;
spin_lock(&mm->page_table_lock);
- pte = huge_pte_offset(mm, address);
- if (pte && !pte_none(*pte))
- ret = VM_FAULT_MINOR;
+ size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto backout;
+
+ ret = VM_FAULT_MINOR;
+ if (!pte_none(*pte))
+ goto backout;
+
+ add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+out:
return ret;
+
+backout:
+ spin_unlock(&mm->page_table_lock);
+ hugetlb_put_quota(mapping);
+ unlock_page(page);
+ put_page(page);
+ goto out;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vpfn, vaddr = *position;
int remainder = *length;
- BUG_ON(!is_vm_hugetlb_page(vma));
-
vpfn = vaddr/PAGE_SIZE;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
+ pte_t *pte;
+ struct page *page;
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- /* Some archs (sparc64, sh*) have multiple
- * pte_ts to each hugepage. We have to make
- * sure we get the first, for the page
- * indexing below to work. */
- pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
- /* the hugetlb file might have been truncated */
- if (!pte || pte_none(*pte)) {
- remainder = 0;
- if (!i)
- i = -EFAULT;
- break;
- }
+ /*
+ * Some archs (sparc64, sh*) have multiple pte_ts to
+ * each hugepage. We have to make * sure we get the
+ * first, for the page indexing below to work.
+ */
+ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+ if (!pte || pte_none(*pte)) {
+ int ret;
- WARN_ON(!PageCompound(page));
+ spin_unlock(&mm->page_table_lock);
+ ret = hugetlb_fault(mm, vma, vaddr, 0);
+ spin_lock(&mm->page_table_lock);
+ if (ret == VM_FAULT_MINOR)
+ continue;
+
+ remainder = 0;
+ if (!i)
+ i = -EFAULT;
+ break;
+ }
+ if (pages) {
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
get_page(page);
pages[i] = page;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e075d1c64c..17aaf3e16449 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
unsigned long start, unsigned long end)
{
*prev = vma;
- if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
return -EINVAL;
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index 1db40e935e55..0f60baf6f69b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
struct page *page = pmd_page(*pmd);
pmd_clear(pmd);
+ pte_lock_deinit(page);
pte_free_tlb(tlb, page);
dec_page_state(nr_page_table_pages);
tlb->mm->nr_ptes--;
@@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb,
free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
- if (!tlb_is_full_mm(*tlb))
+ if (!(*tlb)->fullmm)
flush_tlb_pgtables((*tlb)->mm, start, end);
}
@@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
+ /*
+ * Hide vma from rmap and vmtruncate before freeing pgtables
+ */
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
+
if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
@@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
HPAGE_SIZE)) {
vma = next;
next = vma->vm_next;
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
@@ -280,72 +289,78 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
}
}
-pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- struct page *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free(new);
- goto out;
- }
+ struct page *new = pte_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ pte_lock_init(new);
+ spin_lock(&mm->page_table_lock);
+ if (pmd_present(*pmd)) { /* Another has populated it */
+ pte_lock_deinit(new);
+ pte_free(new);
+ } else {
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
}
-out:
- return pte_offset_map(pmd, address);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
}
-pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- pte_t *new;
+ pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ if (!new)
+ return -ENOMEM;
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one_kernel(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
+ spin_lock(&init_mm.page_table_lock);
+ if (pmd_present(*pmd)) /* Another has populated it */
+ pte_free_kernel(new);
+ else
+ pmd_populate_kernel(&init_mm, pmd, new);
+ spin_unlock(&init_mm.page_table_lock);
+ return 0;
+}
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free_kernel(new);
- goto out;
- }
- pmd_populate_kernel(mm, pmd, new);
- }
-out:
- return pte_offset_kernel(pmd, address);
+static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+{
+ if (file_rss)
+ add_mm_counter(mm, file_rss, file_rss);
+ if (anon_rss)
+ add_mm_counter(mm, anon_rss, anon_rss);
+}
+
+/*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error.
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+ printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+ "vm_flags = %lx, vaddr = %lx\n",
+ (long long)pte_val(pte),
+ (vma->vm_mm == current->mm ? current->comm : "???"),
+ vma->vm_flags, vaddr);
+ dump_stack();
}
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
*/
static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
- unsigned long addr)
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+ unsigned long addr, int *rss)
{
+ unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
unsigned long pfn;
@@ -357,29 +372,32 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
- list_add(&dst_mm->mmlist, &src_mm->mmlist);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
}
- set_pte_at(dst_mm, addr, dst_pte, pte);
- return;
+ goto out_set_pte;
}
- pfn = pte_pfn(pte);
- /* the pte points outside of valid memory, the
- * mapping is assumed to be good, meaningful
- * and not mapped via rmap - duplicate the
- * mapping as is.
+ /* If the region is VM_RESERVED, the mapping is not
+ * mapped via rmap - duplicate the pte as is.
*/
- page = NULL;
- if (pfn_valid(pfn))
- page = pfn_to_page(pfn);
+ if (vm_flags & VM_RESERVED)
+ goto out_set_pte;
- if (!page || PageReserved(page)) {
- set_pte_at(dst_mm, addr, dst_pte, pte);
- return;
+ pfn = pte_pfn(pte);
+ /* If the pte points outside of valid memory but
+ * the region is not VM_RESERVED, we have a problem.
+ */
+ if (unlikely(!pfn_valid(pfn))) {
+ print_bad_pte(vma, pte, addr);
+ goto out_set_pte; /* try to do something sane */
}
+ page = pfn_to_page(pfn);
+
/*
* If it's a COW mapping, write protect it both
* in the parent and the child
@@ -397,11 +415,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
get_page(page);
- inc_mm_counter(dst_mm, rss);
- if (PageAnon(page))
- inc_mm_counter(dst_mm, anon_rss);
- set_pte_at(dst_mm, addr, dst_pte, pte);
page_dup_rmap(page);
+ rss[!!PageAnon(page)]++;
+
+out_set_pte:
+ set_pte_at(dst_mm, addr, dst_pte, pte);
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -409,38 +427,44 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long addr, unsigned long end)
{
pte_t *src_pte, *dst_pte;
- unsigned long vm_flags = vma->vm_flags;
- int progress;
+ spinlock_t *src_ptl, *dst_ptl;
+ int progress = 0;
+ int rss[2];
again:
- dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+ rss[1] = rss[0] = 0;
+ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_ptl = pte_lockptr(src_mm, src_pmd);
+ spin_lock(src_ptl);
- progress = 0;
- spin_lock(&src_mm->page_table_lock);
do {
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
*/
- if (progress >= 32 && (need_resched() ||
- need_lockbreak(&src_mm->page_table_lock) ||
- need_lockbreak(&dst_mm->page_table_lock)))
- break;
+ if (progress >= 32) {
+ progress = 0;
+ if (need_resched() ||
+ need_lockbreak(src_ptl) ||
+ need_lockbreak(dst_ptl))
+ break;
+ }
if (pte_none(*src_pte)) {
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
+ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
- spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
- pte_unmap(dst_pte - 1);
- cond_resched_lock(&dst_mm->page_table_lock);
+ add_mm_rss(dst_mm, rss[0], rss[1]);
+ pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ cond_resched();
if (addr != end)
goto again;
return 0;
@@ -525,24 +549,30 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return 0;
}
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
+ struct mm_struct *mm = tlb->mm;
pte_t *pte;
+ spinlock_t *ptl;
+ int file_rss = 0;
+ int anon_rss = 0;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
pte_t ptent = *pte;
if (pte_none(ptent))
continue;
if (pte_present(ptent)) {
struct page *page = NULL;
- unsigned long pfn = pte_pfn(ptent);
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageReserved(page))
- page = NULL;
+ if (!(vma->vm_flags & VM_RESERVED)) {
+ unsigned long pfn = pte_pfn(ptent);
+ if (unlikely(!pfn_valid(pfn)))
+ print_bad_pte(vma, ptent, addr);
+ else
+ page = pfn_to_page(pfn);
}
if (unlikely(details) && page) {
/*
@@ -562,7 +592,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
page->index > details->last_index))
continue;
}
- ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
@@ -570,15 +600,17 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
addr) != page->index)
- set_pte_at(tlb->mm, addr, pte,
+ set_pte_at(mm, addr, pte,
pgoff_to_pte(page->index));
- if (pte_dirty(ptent))
- set_page_dirty(page);
if (PageAnon(page))
- dec_mm_counter(tlb->mm, anon_rss);
- else if (pte_young(ptent))
- mark_page_accessed(page);
- tlb->freed++;
+ anon_rss--;
+ else {
+ if (pte_dirty(ptent))
+ set_page_dirty(page);
+ if (pte_young(ptent))
+ mark_page_accessed(page);
+ file_rss--;
+ }
page_remove_rmap(page);
tlb_remove_page(tlb, page);
continue;
@@ -591,12 +623,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
continue;
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
- pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
+ pte_clear_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+
+ add_mm_rss(mm, file_rss, anon_rss);
+ pte_unmap_unlock(pte - 1, ptl);
}
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
@@ -608,11 +643,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- zap_pte_range(tlb, pmd, addr, next, details);
+ zap_pte_range(tlb, vma, pmd, addr, next, details);
} while (pmd++, addr = next, addr != end);
}
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
@@ -624,7 +660,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- zap_pmd_range(tlb, pud, addr, next, details);
+ zap_pmd_range(tlb, vma, pud, addr, next, details);
} while (pud++, addr = next, addr != end);
}
@@ -645,7 +681,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- zap_pud_range(tlb, pgd, addr, next, details);
+ zap_pud_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
}
@@ -660,7 +696,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlbp: address of the caller's struct mmu_gather
- * @mm: the controlling mm_struct
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
@@ -669,10 +704,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
*
* Returns the end address of the unmapping (restart addr if interrupted).
*
- * Unmap all pages in the vma list. Called under page_table_lock.
+ * Unmap all pages in the vma list.
*
- * We aim to not hold page_table_lock for too long (for scheduling latency
- * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
+ * We aim to not hold locks for too long (for scheduling latency reasons).
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
* return the ending mmu_gather to the caller.
*
* Only addresses between `start' and `end' will be unmapped.
@@ -684,7 +719,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlbp,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
@@ -694,7 +729,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
int tlb_start_valid = 0;
unsigned long start = start_addr;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
- int fullmm = tlb_is_full_mm(*tlbp);
+ int fullmm = (*tlbp)->fullmm;
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long end;
@@ -734,19 +769,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
tlb_finish_mmu(*tlbp, tlb_start, start);
if (need_resched() ||
- need_lockbreak(&mm->page_table_lock) ||
(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
if (i_mmap_lock) {
- /* must reset count of rss freed */
- *tlbp = tlb_gather_mmu(mm, fullmm);
+ *tlbp = NULL;
goto out;
}
- spin_unlock(&mm->page_table_lock);
cond_resched();
- spin_lock(&mm->page_table_lock);
}
- *tlbp = tlb_gather_mmu(mm, fullmm);
+ *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
tlb_start_valid = 0;
zap_bytes = ZAP_BLOCK_SIZE;
}
@@ -770,123 +801,93 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long end = address + size;
unsigned long nr_accounted = 0;
- if (is_vm_hugetlb_page(vma)) {
- zap_hugepage_range(vma, address, size);
- return end;
- }
-
lru_add_drain();
- spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
- end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
- tlb_finish_mmu(tlb, address, end);
- spin_unlock(&mm->page_table_lock);
+ update_hiwater_rss(mm);
+ end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ if (tlb)
+ tlb_finish_mmu(tlb, address, end);
return end;
}
/*
* Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
*/
-static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
- int read, int write, int accessed)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+ unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
+ spinlock_t *ptl;
unsigned long pfn;
struct page *page;
- page = follow_huge_addr(mm, address, write);
- if (! IS_ERR(page))
- return page;
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ BUG_ON(flags & FOLL_GET);
+ goto out;
+ }
+ page = NULL;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- goto out;
+ goto no_page_table;
pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
- goto out;
+ goto no_page_table;
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto no_page_table;
+
+ if (pmd_huge(*pmd)) {
+ BUG_ON(flags & FOLL_GET);
+ page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
- if (pmd_huge(*pmd))
- return follow_huge_pmd(mm, address, pmd, write);
+ }
- ptep = pte_offset_map(pmd, address);
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
goto out;
pte = *ptep;
- pte_unmap(ptep);
- if (pte_present(pte)) {
- if (write && !pte_write(pte))
- goto out;
- if (read && !pte_read(pte))
- goto out;
- pfn = pte_pfn(pte);
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (accessed) {
- if (write && !pte_dirty(pte) &&!PageDirty(page))
- set_page_dirty(page);
- mark_page_accessed(page);
- }
- return page;
- }
+ if (!pte_present(pte))
+ goto unlock;
+ if ((flags & FOLL_WRITE) && !pte_write(pte))
+ goto unlock;
+ pfn = pte_pfn(pte);
+ if (!pfn_valid(pfn))
+ goto unlock;
+
+ page = pfn_to_page(pfn);
+ if (flags & FOLL_GET)
+ get_page(page);
+ if (flags & FOLL_TOUCH) {
+ if ((flags & FOLL_WRITE) &&
+ !pte_dirty(pte) && !PageDirty(page))
+ set_page_dirty(page);
+ mark_page_accessed(page);
}
-
+unlock:
+ pte_unmap_unlock(ptep, ptl);
out:
- return NULL;
-}
-
-inline struct page *
-follow_page(struct mm_struct *mm, unsigned long address, int write)
-{
- return __follow_page(mm, address, 0, write, 1);
-}
-
-/*
- * check_user_page_readable() can be called frm niterrupt context by oprofile,
- * so we need to avoid taking any non-irq-safe locks
- */
-int check_user_page_readable(struct mm_struct *mm, unsigned long address)
-{
- return __follow_page(mm, address, 1, 0, 0) != NULL;
-}
-EXPORT_SYMBOL(check_user_page_readable);
-
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
- unsigned long address)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- /* Check if the vma is for an anonymous mapping. */
- if (vma->vm_ops && vma->vm_ops->nopage)
- return 0;
-
- /* Check if page directory entry exists. */
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- return 1;
-
- pud = pud_offset(pgd, address);
- if (pud_none(*pud) || unlikely(pud_bad(*pud)))
- return 1;
-
- /* Check if page middle directory entry exists. */
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- return 1;
+ return page;
- /* There is a pte slot for 'address' in 'mm'. */
- return 0;
+no_page_table:
+ /*
+ * When core dumping an enormous anonymous area that nobody
+ * has touched so far, we don't want to allocate page tables.
+ */
+ if (flags & FOLL_ANON) {
+ page = ZERO_PAGE(address);
+ if (flags & FOLL_GET)
+ get_page(page);
+ BUG_ON(flags & FOLL_WRITE);
+ }
+ return page;
}
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +895,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
struct page **pages, struct vm_area_struct **vmas)
{
int i;
- unsigned int flags;
+ unsigned int vm_flags;
/*
* Require read or write permissions.
* If 'force' is set, we only require the "MAY" flags.
*/
- flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
- flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+ vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
i = 0;
do {
- struct vm_area_struct * vma;
+ struct vm_area_struct *vma;
+ unsigned int foll_flags;
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(tsk, start)) {
@@ -945,8 +947,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
continue;
}
- if (!vma || (vma->vm_flags & VM_IO)
- || !(flags & vma->vm_flags))
+ if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
+ || !(vm_flags & vma->vm_flags))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +956,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
&start, &len, i);
continue;
}
- spin_lock(&mm->page_table_lock);
+
+ foll_flags = FOLL_TOUCH;
+ if (pages)
+ foll_flags |= FOLL_GET;
+ if (!write && !(vma->vm_flags & VM_LOCKED) &&
+ (!vma->vm_ops || !vma->vm_ops->nopage))
+ foll_flags |= FOLL_ANON;
+
do {
- int write_access = write;
struct page *page;
- cond_resched_lock(&mm->page_table_lock);
- while (!(page = follow_page(mm, start, write_access))) {
- int ret;
-
- /*
- * Shortcut for anonymous pages. We don't want
- * to force the creation of pages tables for
- * insanely big anonymously mapped areas that
- * nobody touched so far. This is important
- * for doing a core dump for these mappings.
- */
- if (!write && untouched_anonymous_page(mm,vma,start)) {
- page = ZERO_PAGE(start);
- break;
- }
- spin_unlock(&mm->page_table_lock);
- ret = __handle_mm_fault(mm, vma, start, write_access);
+ if (write)
+ foll_flags |= FOLL_WRITE;
+ cond_resched();
+ while (!(page = follow_page(mm, start, foll_flags))) {
+ int ret;
+ ret = __handle_mm_fault(mm, vma, start,
+ foll_flags & FOLL_WRITE);
/*
* The VM_FAULT_WRITE bit tells us that do_wp_page has
* broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +982,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
* subsequent page lookups as if they were reads.
*/
if (ret & VM_FAULT_WRITE)
- write_access = 0;
+ foll_flags &= ~FOLL_WRITE;
switch (ret & ~VM_FAULT_WRITE) {
case VM_FAULT_MINOR:
@@ -1000,13 +998,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
default:
BUG();
}
- spin_lock(&mm->page_table_lock);
}
if (pages) {
pages[i] = page;
flush_dcache_page(page);
- if (!PageReserved(page))
- page_cache_get(page);
}
if (vmas)
vmas[i] = vma;
@@ -1014,7 +1009,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
start += PAGE_SIZE;
len--;
} while (len && start < vma->vm_end);
- spin_unlock(&mm->page_table_lock);
} while (len);
return i;
}
@@ -1024,16 +1018,21 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t prot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
do {
- pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+ struct page *page = ZERO_PAGE(addr);
+ pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+ page_cache_get(page);
+ page_add_file_rmap(page);
+ inc_mm_counter(mm, file_rss);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1083,14 +1082,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = zeromap_pud_range(mm, pgd, addr, next, prot);
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
return err;
}
@@ -1104,17 +1101,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long pfn, pgprot_t prot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
do {
BUG_ON(!pte_none(*pte));
- if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
- set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+ set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1173,8 +1170,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
- * VM_RESERVED tells swapout not to try to touch
- * this region.
+ * VM_RESERVED tells the core MM not to "manage" these pages
+ * (e.g. refcount, mapcount, try to swap them out).
*/
vma->vm_flags |= VM_IO | VM_RESERVED;
@@ -1182,7 +1179,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = remap_pud_range(mm, pgd, addr, next,
@@ -1190,12 +1186,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
/*
+ * handle_pte_fault chooses page fault handler according to an entry
+ * which was read non-atomically. Before making any commitment, on
+ * those architectures or configurations (e.g. i386 with PAE) which
+ * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * must check under lock before unmapping the pte and proceeding
+ * (but do_wp_page is only called after already making such a check;
+ * and do_anonymous_page and do_no_page can safely check later on).
+ */
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
+ pte_t *page_table, pte_t orig_pte)
+{
+ int same = 1;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+ if (sizeof(pte_t) > sizeof(unsigned long)) {
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ same = pte_same(*page_table, orig_pte);
+ spin_unlock(ptl);
+ }
+#endif
+ pte_unmap(page_table);
+ return same;
+}
+
+/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
* servicing faults for write access. In the normal case, do always want
* pte_mkwrite. But get_user_pages can cause write faults for mappings
@@ -1209,28 +1229,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
}
/*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
- pte_t *page_table)
-{
- pte_t entry;
-
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
- vma);
- ptep_establish(vma, address, page_table, entry);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
-}
-
-/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
- * Goto-purists beware: the only reason for goto's here is that it results
- * in better assembly code.. The "default" path will see no jumps at all.
- *
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
@@ -1240,28 +1242,28 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
- * We hold the mm semaphore and the page_table_lock on entry and exit
- * with the page_table_lock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
{
struct page *old_page, *new_page;
- unsigned long pfn = pte_pfn(pte);
+ unsigned long pfn = pte_pfn(orig_pte);
pte_t entry;
- int ret;
+ int ret = VM_FAULT_MINOR;
+
+ BUG_ON(vma->vm_flags & VM_RESERVED);
if (unlikely(!pfn_valid(pfn))) {
/*
- * This should really halt the system so it can be debugged or
- * at least the kernel stops what it's doing before it corrupts
- * data, but for the moment just pretend this is OOM.
+ * Page table corrupted: show pte and kill process.
*/
- pte_unmap(page_table);
- printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
- address);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_OOM;
+ print_bad_pte(vma, orig_pte, address);
+ ret = VM_FAULT_OOM;
+ goto unlock;
}
old_page = pfn_to_page(pfn);
@@ -1270,52 +1272,51 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
unlock_page(old_page);
if (reuse) {
flush_cache_page(vma, address, pfn);
- entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
- vma);
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
ptep_set_access_flags(vma, address, page_table, entry, 1);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_MINOR|VM_FAULT_WRITE;
+ ret |= VM_FAULT_WRITE;
+ goto unlock;
}
}
- pte_unmap(page_table);
/*
* Ok, we need to copy. Oh, well..
*/
- if (!PageReserved(old_page))
- page_cache_get(old_page);
- spin_unlock(&mm->page_table_lock);
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
if (unlikely(anon_vma_prepare(vma)))
- goto no_new_page;
+ goto oom;
if (old_page == ZERO_PAGE(address)) {
new_page = alloc_zeroed_user_highpage(vma, address);
if (!new_page)
- goto no_new_page;
+ goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!new_page)
- goto no_new_page;
+ goto oom;
copy_user_highpage(new_page, old_page, address);
}
+
/*
* Re-check the pte - we dropped the lock
*/
- ret = VM_FAULT_MINOR;
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
- if (likely(pte_same(*page_table, pte))) {
- if (PageAnon(old_page))
- dec_mm_counter(mm, anon_rss);
- if (PageReserved(old_page))
- inc_mm_counter(mm, rss);
- else
- page_remove_rmap(old_page);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (likely(pte_same(*page_table, orig_pte))) {
+ page_remove_rmap(old_page);
+ if (!PageAnon(old_page)) {
+ inc_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, file_rss);
+ }
flush_cache_page(vma, address, pfn);
- break_cow(vma, new_page, address, page_table);
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ ptep_establish(vma, address, page_table, entry);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
lru_cache_add_active(new_page);
page_add_anon_rmap(new_page, vma, address);
@@ -1323,13 +1324,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
new_page = old_page;
ret |= VM_FAULT_WRITE;
}
- pte_unmap(page_table);
page_cache_release(new_page);
page_cache_release(old_page);
- spin_unlock(&mm->page_table_lock);
+unlock:
+ pte_unmap_unlock(page_table, ptl);
return ret;
-
-no_new_page:
+oom:
page_cache_release(old_page);
return VM_FAULT_OOM;
}
@@ -1399,13 +1399,6 @@ again:
restart_addr = zap_page_range(vma, start_addr,
end_addr - start_addr, details);
-
- /*
- * We cannot rely on the break test in unmap_vmas:
- * on the one hand, we don't want to restart our loop
- * just because that broke out for the page_table_lock;
- * on the other hand, it does no test when vma is small.
- */
need_break = need_resched() ||
need_lockbreak(details->i_mmap_lock);
@@ -1654,38 +1647,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
}
/*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_swap_page(struct mm_struct * mm,
- struct vm_area_struct * vma, unsigned long address,
- pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
{
+ spinlock_t *ptl;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(orig_pte);
+ swp_entry_t entry;
pte_t pte;
int ret = VM_FAULT_MINOR;
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+ goto out;
+
+ entry = pte_to_swp_entry(orig_pte);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte while
- * we released the page table lock.
+ * Back out if somebody else faulted in this pte
+ * while we released the pte lock.
*/
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_MINOR;
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- goto out;
+ goto unlock;
}
/* Had to read the page from swap area: Major fault */
@@ -1698,15 +1690,11 @@ static int do_swap_page(struct mm_struct * mm,
lock_page(page);
/*
- * Back out if somebody else faulted in this pte while we
- * released the page table lock.
+ * Back out if somebody else already faulted in this pte.
*/
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
- if (unlikely(!pte_same(*page_table, orig_pte))) {
- ret = VM_FAULT_MINOR;
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
- }
if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
@@ -1715,7 +1703,7 @@ static int do_swap_page(struct mm_struct * mm,
/* The page isn't present yet, go ahead with the fault. */
- inc_mm_counter(mm, rss);
+ inc_mm_counter(mm, anon_rss);
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1733,7 +1721,7 @@ static int do_swap_page(struct mm_struct * mm,
if (write_access) {
if (do_wp_page(mm, vma, address,
- page_table, pmd, pte) == VM_FAULT_OOM)
+ page_table, pmd, ptl, pte) == VM_FAULT_OOM)
ret = VM_FAULT_OOM;
goto out;
}
@@ -1741,74 +1729,76 @@ static int do_swap_page(struct mm_struct * mm,
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
lazy_mmu_prot_update(pte);
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+unlock:
+ pte_unmap_unlock(page_table, ptl);
out:
return ret;
out_nomap:
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(page_table, ptl);
unlock_page(page);
page_cache_release(page);
- goto out;
+ return ret;
}
/*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int
-do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr)
+static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
{
+ struct page *page;
+ spinlock_t *ptl;
pte_t entry;
- struct page * page = ZERO_PAGE(addr);
-
- /* Read-only mapping of ZERO_PAGE. */
- entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
- /* ..except if it's a write access */
if (write_access) {
/* Allocate our own private page. */
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
if (unlikely(anon_vma_prepare(vma)))
- goto no_mem;
- page = alloc_zeroed_user_highpage(vma, addr);
+ goto oom;
+ page = alloc_zeroed_user_highpage(vma, address);
if (!page)
- goto no_mem;
+ goto oom;
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, addr);
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pte_none(*page_table)) {
- pte_unmap(page_table);
- page_cache_release(page);
- spin_unlock(&mm->page_table_lock);
- goto out;
- }
- inc_mm_counter(mm, rss);
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
- vma->vm_page_prot)),
- vma);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_none(*page_table))
+ goto release;
+ inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
SetPageReferenced(page);
- page_add_anon_rmap(page, vma, addr);
+ page_add_anon_rmap(page, vma, address);
+ } else {
+ /* Map the ZERO_PAGE - vm_page_prot is readonly */
+ page = ZERO_PAGE(address);
+ page_cache_get(page);
+ entry = mk_pte(page, vma->vm_page_prot);
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (!pte_none(*page_table))
+ goto release;
+ inc_mm_counter(mm, file_rss);
+ page_add_file_rmap(page);
}
- set_pte_at(mm, addr, page_table, entry);
- pte_unmap(page_table);
+ set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, addr, entry);
+ update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- spin_unlock(&mm->page_table_lock);
-out:
+unlock:
+ pte_unmap_unlock(page_table, ptl);
return VM_FAULT_MINOR;
-no_mem:
+release:
+ page_cache_release(page);
+ goto unlock;
+oom:
return VM_FAULT_OOM;
}
@@ -1821,25 +1811,23 @@ no_mem:
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int
-do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
{
- struct page * new_page;
+ spinlock_t *ptl;
+ struct page *new_page;
struct address_space *mapping = NULL;
pte_t entry;
unsigned int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
@@ -1847,7 +1835,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
smp_rmb(); /* serializes i_size against truncate_count */
}
retry:
- cond_resched();
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
/*
* No smp_rmb is needed here as long as there's a full
@@ -1880,19 +1867,20 @@ retry:
anon = 1;
}
- spin_lock(&mm->page_table_lock);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* For a file-backed vma, someone could have truncated or otherwise
* invalidated this page. If unmap_mapping_range got called,
* retry getting the page.
*/
if (mapping && unlikely(sequence != mapping->truncate_count)) {
- sequence = mapping->truncate_count;
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(page_table, ptl);
page_cache_release(new_page);
+ cond_resched();
+ sequence = mapping->truncate_count;
+ smp_rmb();
goto retry;
}
- page_table = pte_offset_map(pmd, address);
/*
* This silly early PAGE_DIRTY setting removes a race
@@ -1906,68 +1894,67 @@ retry:
*/
/* Only go through if we didn't race with anybody else... */
if (pte_none(*page_table)) {
- if (!PageReserved(new_page))
- inc_mm_counter(mm, rss);
-
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
if (anon) {
+ inc_mm_counter(mm, anon_rss);
lru_cache_add_active(new_page);
page_add_anon_rmap(new_page, vma, address);
- } else
+ } else if (!(vma->vm_flags & VM_RESERVED)) {
+ inc_mm_counter(mm, file_rss);
page_add_file_rmap(new_page);
- pte_unmap(page_table);
+ }
} else {
/* One of our sibling threads was faster, back out. */
- pte_unmap(page_table);
page_cache_release(new_page);
- spin_unlock(&mm->page_table_lock);
- goto out;
+ goto unlock;
}
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- spin_unlock(&mm->page_table_lock);
-out:
+unlock:
+ pte_unmap_unlock(page_table, ptl);
return ret;
oom:
page_cache_release(new_page);
- ret = VM_FAULT_OOM;
- goto out;
+ return VM_FAULT_OOM;
}
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
{
- unsigned long pgoff;
+ pgoff_t pgoff;
int err;
- BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
- /*
- * Fall back to the linear mapping if the fs does not support
- * ->populate:
- */
- if (!vma->vm_ops->populate ||
- (write_access && !(vma->vm_flags & VM_SHARED))) {
- pte_clear(mm, address, pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd);
- }
-
- pgoff = pte_to_pgoff(*pte);
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+ return VM_FAULT_MINOR;
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+ /*
+ * Page table corrupted: show pte and kill process.
+ */
+ print_bad_pte(vma, orig_pte, address);
+ return VM_FAULT_OOM;
+ }
+ /* We can then assume vm->vm_ops && vma->vm_ops->populate */
- err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+ pgoff = pte_to_pgoff(orig_pte);
+ err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
+ vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
return VM_FAULT_OOM;
if (err)
@@ -1984,56 +1971,68 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct * vma, unsigned long address,
- int write_access, pte_t *pte, pmd_t *pmd)
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, int write_access)
{
pte_t entry;
+ pte_t old_entry;
+ spinlock_t *ptl;
- entry = *pte;
+ old_entry = entry = *pte;
if (!pte_present(entry)) {
- /*
- * If it truly wasn't present, we know that kswapd
- * and the PTE updates will not touch it later. So
- * drop the lock.
- */
- if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ if (pte_none(entry)) {
+ if (!vma->vm_ops || !vma->vm_ops->nopage)
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, write_access);
+ return do_no_page(mm, vma, address,
+ pte, pmd, write_access);
+ }
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access, pte, pmd);
- return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+ return do_file_page(mm, vma, address,
+ pte, pmd, write_access, entry);
+ return do_swap_page(mm, vma, address,
+ pte, pmd, write_access, entry);
}
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*pte, entry)))
+ goto unlock;
if (write_access) {
if (!pte_write(entry))
- return do_wp_page(mm, vma, address, pte, pmd, entry);
+ return do_wp_page(mm, vma, address,
+ pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- ptep_set_access_flags(vma, address, pte, entry, write_access);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ if (!pte_same(old_entry, entry)) {
+ ptep_set_access_flags(vma, address, pte, entry, write_access);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+ } else {
+ /*
+ * This is needed only for protection faults but the arch code
+ * is not yet telling us if this is a protection fault or not.
+ * This still avoids useless tlb flushes for .text page faults
+ * with threads.
+ */
+ if (write_access)
+ flush_tlb_page(vma, address);
+ }
+unlock:
+ pte_unmap_unlock(pte, ptl);
return VM_FAULT_MINOR;
}
/*
* By the time we get here, we already hold the mm semaphore
*/
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
pgd_t *pgd;
@@ -2048,100 +2047,66 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, write_access);
- /*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
- */
pgd = pgd_offset(mm, address);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, address);
if (!pud)
- goto oom;
-
+ return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
- goto oom;
-
+ return VM_FAULT_OOM;
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
- goto oom;
-
- return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+ return VM_FAULT_OOM;
- oom:
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_OOM;
+ return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*/
-pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
- pud_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pud_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ pud_t *new = pud_alloc_one(mm, address);
if (!new)
- return NULL;
+ return -ENOMEM;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pgd_present(*pgd)) {
+ spin_lock(&mm->page_table_lock);
+ if (pgd_present(*pgd)) /* Another has populated it */
pud_free(new);
- goto out;
- }
- pgd_populate(mm, pgd, new);
- out:
- return pud_offset(pgd, address);
+ else
+ pgd_populate(mm, pgd, new);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*/
-pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
- pmd_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pmd_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
- return NULL;
+ return -ENOMEM;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
+ spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
- if (pud_present(*pud)) {
+ if (pud_present(*pud)) /* Another has populated it */
pmd_free(new);
- goto out;
- }
- pud_populate(mm, pud, new);
+ else
+ pud_populate(mm, pud, new);
#else
- if (pgd_present(*pud)) {
+ if (pgd_present(*pud)) /* Another has populated it */
pmd_free(new);
- goto out;
- }
- pgd_populate(mm, pud, new);
+ else
+ pgd_populate(mm, pud, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
-
- out:
- return pmd_offset(pud, address);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
@@ -2206,22 +2171,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
EXPORT_SYMBOL(vmalloc_to_pfn);
-/*
- * update_mem_hiwater
- * - update per process rss and vm high water data
- */
-void update_mem_hiwater(struct task_struct *tsk)
-{
- if (tsk->mm) {
- unsigned long rss = get_mm_counter(tsk->mm, rss);
-
- if (tsk->mm->hiwater_rss < rss)
- tsk->mm->hiwater_rss = rss;
- if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
- tsk->mm->hiwater_vm = tsk->mm->total_vm;
- }
-}
-
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
@@ -2233,7 +2182,7 @@ static int __init gate_vma_init(void)
gate_vma.vm_start = FIXADDR_USER_START;
gate_vma.vm_end = FIXADDR_USER_END;
gate_vma.vm_page_prot = PAGE_READONLY;
- gate_vma.vm_flags = 0;
+ gate_vma.vm_flags = VM_RESERVED;
return 0;
}
__initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
new file mode 100644
index 000000000000..431a64f021c0
--- /dev/null
+++ b/mm/memory_hotplug.c
@@ -0,0 +1,138 @@
+/*
+ * linux/mm/memory_hotplug.c
+ *
+ * Copyright (C)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/pagevec.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+
+extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+ unsigned long size);
+static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nr_pages = PAGES_PER_SECTION;
+ int nid = pgdat->node_id;
+ int zone_type;
+
+ zone_type = zone - pgdat->node_zones;
+ memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+ zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+}
+
+extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+ int nr_pages);
+static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nr_pages = PAGES_PER_SECTION;
+ int ret;
+
+ ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+
+ if (ret < 0)
+ return ret;
+
+ __add_zone(zone, phys_start_pfn);
+ return register_new_memory(__pfn_to_section(phys_start_pfn));
+}
+
+/*
+ * Reasonably generic function for adding memory. It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+ int err = 0;
+
+ for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
+ err = __add_section(zone, phys_start_pfn + i);
+
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+static void grow_zone_span(struct zone *zone,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long old_zone_end_pfn;
+
+ zone_span_writelock(zone);
+
+ old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ if (start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+
+ if (end_pfn > old_zone_end_pfn)
+ zone->spanned_pages = end_pfn - zone->zone_start_pfn;
+
+ zone_span_writeunlock(zone);
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long old_pgdat_end_pfn =
+ pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+ if (start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+
+ if (end_pfn > old_pgdat_end_pfn)
+ pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages;
+}
+
+int online_pages(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long i;
+ unsigned long flags;
+ unsigned long onlined_pages = 0;
+ struct zone *zone;
+
+ /*
+ * This doesn't need a lock to do pfn_to_page().
+ * The section can't be removed here because of the
+ * memory_block->state_sem.
+ */
+ zone = page_zone(pfn_to_page(pfn));
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ grow_zone_span(zone, pfn, pfn + nr_pages);
+ grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pfn_to_page(pfn + i);
+ online_page(page);
+ onlined_pages++;
+ }
+ zone->present_pages += onlined_pages;
+
+ setup_per_zone_pages_min();
+
+ return 0;
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 37af443eb094..5abc57c2b8bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
* Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
* offset into the backing object or offset into the mapping
* for anonymous memory. For process policy an process counter
* is used.
+ *
* bind Only allocate memory on a specific set of nodes,
* no fallback.
+ * FIXME: memory is allocated starting with the first node
+ * to the last. It would be better if bind would truly restrict
+ * the allocation to memory nodes instead
+ *
* preferred Try a specific node first before normal fallback.
* As a special case node -1 here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -93,23 +100,10 @@ struct mempolicy default_policy = {
.policy = MPOL_DEFAULT,
};
-/* Check if all specified nodes are online */
-static int nodes_online(unsigned long *nodes)
-{
- DECLARE_BITMAP(online2, MAX_NUMNODES);
-
- bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
- if (bitmap_empty(online2, MAX_NUMNODES))
- set_bit(0, online2);
- if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
- return -EINVAL;
- return 0;
-}
-
/* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, unsigned long *nodes)
+static int mpol_check_policy(int mode, nodemask_t *nodes)
{
- int empty = bitmap_empty(nodes, MAX_NUMNODES);
+ int empty = nodes_empty(*nodes);
switch (mode) {
case MPOL_DEFAULT:
@@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes)
return -EINVAL;
break;
}
- return nodes_online(nodes);
-}
-
-/* Copy a node mask from user space. */
-static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
- unsigned long maxnode, int mode)
-{
- unsigned long k;
- unsigned long nlongs;
- unsigned long endmask;
-
- --maxnode;
- bitmap_zero(nodes, MAX_NUMNODES);
- if (maxnode == 0 || !nmask)
- return 0;
-
- nlongs = BITS_TO_LONGS(maxnode);
- if ((maxnode % BITS_PER_LONG) == 0)
- endmask = ~0UL;
- else
- endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
- /* When the user specified more nodes than supported just check
- if the non supported part is all zero. */
- if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
- if (nlongs > PAGE_SIZE/sizeof(long))
- return -EINVAL;
- for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
- unsigned long t;
- if (get_user(t, nmask + k))
- return -EFAULT;
- if (k == nlongs - 1) {
- if (t & endmask)
- return -EINVAL;
- } else if (t)
- return -EINVAL;
- }
- nlongs = BITS_TO_LONGS(MAX_NUMNODES);
- endmask = ~0UL;
- }
-
- if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
- return -EFAULT;
- nodes[nlongs-1] &= endmask;
- /* Update current mems_allowed */
- cpuset_update_current_mems_allowed();
- /* Ignore nodes not set in current->mems_allowed */
- cpuset_restrict_to_mems_allowed(nodes);
- return mpol_check_policy(mode, nodes);
+ return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
-
/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(unsigned long *nodes)
+static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
int num, max, nd;
- max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
+ max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
if (!zl)
return NULL;
num = 0;
- for (nd = find_first_bit(nodes, MAX_NUMNODES);
- nd < MAX_NUMNODES;
- nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
+ for_each_node_mask(nd, *nodes) {
int k;
for (k = MAX_NR_ZONES-1; k >= 0; k--) {
struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes)
policy_zone = k;
}
}
- BUG_ON(num >= max);
zl->zones[num] = NULL;
return zl;
}
/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
+static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
struct mempolicy *policy;
- PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
+ PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
if (mode == MPOL_DEFAULT)
return NULL;
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
atomic_set(&policy->refcnt, 1);
switch (mode) {
case MPOL_INTERLEAVE:
- bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
+ policy->v.nodes = *nodes;
break;
case MPOL_PREFERRED:
- policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
+ policy->v.preferred_node = first_node(*nodes);
if (policy->v.preferred_node >= MAX_NUMNODES)
policy->v.preferred_node = -1;
break;
@@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
}
/* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end, nodemask_t *nodes)
{
pte_t *orig_pte;
pte_t *pte;
+ spinlock_t *ptl;
- spin_lock(&mm->page_table_lock);
- orig_pte = pte = pte_offset_map(pmd, addr);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
unsigned long pfn;
unsigned int nid;
@@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (!pte_present(*pte))
continue;
pfn = pte_pfn(*pte);
- if (!pfn_valid(pfn))
+ if (!pfn_valid(pfn)) {
+ print_bad_pte(vma, *pte, addr);
continue;
+ }
nid = pfn_to_nid(pfn);
- if (!test_bit(nid, nodes))
+ if (!node_isset(nid, *nodes))
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(orig_pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(orig_pte, ptl);
return addr != end;
}
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end, nodemask_t *nodes)
{
pmd_t *pmd;
unsigned long next;
@@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- if (check_pte_range(mm, pmd, addr, next, nodes))
+ if (check_pte_range(vma, pmd, addr, next, nodes))
return -EIO;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end, nodemask_t *nodes)
{
pud_t *pud;
unsigned long next;
@@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- if (check_pmd_range(mm, pud, addr, next, nodes))
+ if (check_pmd_range(vma, pud, addr, next, nodes))
return -EIO;
} while (pud++, addr = next, addr != end);
return 0;
}
-static inline int check_pgd_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pgd_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end, nodemask_t *nodes)
{
pgd_t *pgd;
unsigned long next;
- pgd = pgd_offset(mm, addr);
+ pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (check_pud_range(mm, pgd, addr, next, nodes))
+ if (check_pud_range(vma, pgd, addr, next, nodes))
return -EIO;
} while (pgd++, addr = next, addr != end);
return 0;
@@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm,
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- unsigned long *nodes, unsigned long flags)
+ nodemask_t *nodes, unsigned long flags)
{
int err;
struct vm_area_struct *first, *vma, *prev;
@@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
first = find_vma(mm, start);
if (!first)
return ERR_PTR(-EFAULT);
+ if (first->vm_flags & VM_RESERVED)
+ return ERR_PTR(-EACCES);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
if (!vma->vm_next && vma->vm_end < end)
@@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
- err = check_pgd_range(vma->vm_mm,
- start, endvma, nodes);
+ err = check_pgd_range(vma, start, endvma, nodes);
if (err) {
first = ERR_PTR(err);
break;
@@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
return err;
}
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
- unsigned long mode,
- unsigned long __user *nmask, unsigned long maxnode,
- unsigned flags)
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+ if (!nodes)
+ return 0;
+
+ /* Update current mems_allowed */
+ cpuset_update_current_mems_allowed();
+ /* Ignore nodes not set in current->mems_allowed */
+ cpuset_restrict_to_mems_allowed(nodes->bits);
+ return mpol_check_policy(mode, nodes);
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
- DECLARE_BITMAP(nodes, MAX_NUMNODES);
int err;
if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
return -EINVAL;
if (end == start)
return 0;
-
- err = get_nodes(nodes, nmask, maxnode, mode);
- if (err)
- return err;
-
- new = mpol_new(mode, nodes);
+ if (mpol_check_policy(mode, nmask))
+ return -EINVAL;
+ new = mpol_new(mode, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes[0]);
+ mode,nodes_addr(nodes)[0]);
down_write(&mm->mmap_sem);
- vma = check_range(mm, start, end, nodes, flags);
+ vma = check_range(mm, start, end, nmask, flags);
err = PTR_ERR(vma);
if (!IS_ERR(vma))
err = mbind_range(vma, start, end, new);
@@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
}
/* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
- unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
{
- int err;
struct mempolicy *new;
- DECLARE_BITMAP(nodes, MAX_NUMNODES);
- if (mode < 0 || mode > MPOL_MAX)
+ if (contextualize_policy(mode, nodes))
return -EINVAL;
- err = get_nodes(nodes, nmask, maxnode, mode);
- if (err)
- return err;
new = mpol_new(mode, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
if (new && new->policy == MPOL_INTERLEAVE)
- current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+ current->il_next = first_node(new->v.nodes);
return 0;
}
/* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
+static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
int i;
- bitmap_zero(nodes, MAX_NUMNODES);
+ nodes_clear(*nodes);
switch (p->policy) {
case MPOL_BIND:
for (i = 0; p->v.zonelist->zones[i]; i++)
- __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
+ node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+ *nodes);
break;
case MPOL_DEFAULT:
break;
case MPOL_INTERLEAVE:
- bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
+ *nodes = p->v.nodes;
break;
case MPOL_PREFERRED:
/* or use current node instead of online map? */
if (p->v.preferred_node < 0)
- bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
+ *nodes = node_online_map;
else
- __set_bit(p->v.preferred_node, nodes);
+ node_set(p->v.preferred_node, *nodes);
break;
default:
BUG();
@@ -504,37 +448,18 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
return err;
}
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
- void *nodes, unsigned nbytes)
-{
- unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-
- if (copy > nbytes) {
- if (copy > PAGE_SIZE)
- return -EINVAL;
- if (clear_user((char __user *)mask + nbytes, copy - nbytes))
- return -EFAULT;
- copy = nbytes;
- }
- return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
-}
-
/* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
- unsigned long __user *nmask,
- unsigned long maxnode,
- unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+ unsigned long addr, unsigned long flags)
{
- int err, pval;
+ int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
+ cpuset_update_current_mems_allowed();
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
- return -EINVAL;
if (flags & MPOL_F_ADDR) {
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
@@ -557,31 +482,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
err = lookup_node(mm, addr);
if (err < 0)
goto out;
- pval = err;
+ *policy = err;
} else if (pol == current->mempolicy &&
pol->policy == MPOL_INTERLEAVE) {
- pval = current->il_next;
+ *policy = current->il_next;
} else {
err = -EINVAL;
goto out;
}
} else
- pval = pol->policy;
+ *policy = pol->policy;
if (vma) {
up_read(&current->mm->mmap_sem);
vma = NULL;
}
- if (policy && put_user(pval, policy))
- return -EFAULT;
-
err = 0;
- if (nmask) {
- DECLARE_BITMAP(nodes, MAX_NUMNODES);
- get_zonemask(pol, nodes);
- err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
- }
+ if (nmask)
+ get_zonemask(pol, nmask);
out:
if (vma)
@@ -589,6 +508,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
return err;
}
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ unsigned long k;
+ unsigned long nlongs;
+ unsigned long endmask;
+
+ --maxnode;
+ nodes_clear(*nodes);
+ if (maxnode == 0 || !nmask)
+ return 0;
+
+ nlongs = BITS_TO_LONGS(maxnode);
+ if ((maxnode % BITS_PER_LONG) == 0)
+ endmask = ~0UL;
+ else
+ endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+ /* When the user specified more nodes than supported just check
+ if the non supported part is all zero. */
+ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+ if (nlongs > PAGE_SIZE/sizeof(long))
+ return -EINVAL;
+ for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+ unsigned long t;
+ if (get_user(t, nmask + k))
+ return -EFAULT;
+ if (k == nlongs - 1) {
+ if (t & endmask)
+ return -EINVAL;
+ } else if (t)
+ return -EINVAL;
+ }
+ nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+ endmask = ~0UL;
+ }
+
+ if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+ return -EFAULT;
+ nodes_addr(*nodes)[nlongs-1] &= endmask;
+ return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+ nodemask_t *nodes)
+{
+ unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+ const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+ if (copy > nbytes) {
+ if (copy > PAGE_SIZE)
+ return -EINVAL;
+ if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+ return -EFAULT;
+ copy = nbytes;
+ }
+ return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+ unsigned long mode,
+ unsigned long __user *nmask, unsigned long maxnode,
+ unsigned flags)
+{
+ nodemask_t nodes;
+ int err;
+
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ int err;
+ nodemask_t nodes;
+
+ if (mode < 0 || mode > MPOL_MAX)
+ return -EINVAL;
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_set_mempolicy(mode, &nodes);
+}
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+ unsigned long __user *nmask,
+ unsigned long maxnode,
+ unsigned long addr, unsigned long flags)
+{
+ int err, pval;
+ nodemask_t nodes;
+
+ if (nmask != NULL && maxnode < MAX_NUMNODES)
+ return -EINVAL;
+
+ err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+ if (err)
+ return err;
+
+ if (policy && put_user(pval, policy))
+ return -EFAULT;
+
+ if (nmask)
+ err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+ return err;
+}
+
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -649,15 +688,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
long err = 0;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
- DECLARE_BITMAP(bm, MAX_NUMNODES);
+ nodemask_t bm;
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
- err = compat_get_bitmap(bm, nmask, nr_bits);
+ err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, bm, alloc_size);
+ err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
}
if (err)
@@ -676,7 +715,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy)
- pol = vma->vm_ops->get_policy(vma, addr);
+ pol = vma->vm_ops->get_policy(vma, addr);
else if (vma->vm_policy &&
vma->vm_policy->policy != MPOL_DEFAULT)
pol = vma->vm_policy;
@@ -700,7 +739,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
case MPOL_BIND:
/* Lower zones don't get a policy applied */
/* Careful: current->mems_allowed might have moved */
- if ((gfp & GFP_ZONEMASK) >= policy_zone)
+ if (gfp_zone(gfp) >= policy_zone)
if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
return policy->v.zonelist;
/*FALL THROUGH*/
@@ -712,7 +751,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
nd = 0;
BUG();
}
- return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
+ return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
/* Do dynamic interleaving for a process */
@@ -722,10 +761,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
struct task_struct *me = current;
nid = me->il_next;
- BUG_ON(nid >= MAX_NUMNODES);
- next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
+ next = next_node(nid, policy->v.nodes);
if (next >= MAX_NUMNODES)
- next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+ next = first_node(policy->v.nodes);
me->il_next = next;
return nid;
}
@@ -734,30 +772,28 @@ static unsigned interleave_nodes(struct mempolicy *policy)
static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
{
- unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
+ unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target = (unsigned)off % nnodes;
int c;
int nid = -1;
c = 0;
do {
- nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
+ nid = next_node(nid, pol->v.nodes);
c++;
} while (c <= target);
- BUG_ON(nid >= MAX_NUMNODES);
- BUG_ON(!test_bit(nid, pol->v.nodes));
return nid;
}
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+ unsigned nid)
{
struct zonelist *zl;
struct page *page;
- BUG_ON(!node_online(nid));
- zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
+ zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
@@ -799,8 +835,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
unsigned nid;
if (vma) {
unsigned long off;
- BUG_ON(addr >= vma->vm_end);
- BUG_ON(addr < vma->vm_start);
off = vma->vm_pgoff;
off += (addr - vma->vm_start) >> PAGE_SHIFT;
nid = offset_il_node(pol, vma, off);
@@ -878,7 +912,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_DEFAULT:
return 1;
case MPOL_INTERLEAVE:
- return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
+ return nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
return a->v.preferred_node == b->v.preferred_node;
case MPOL_BIND: {
@@ -1117,7 +1151,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
vma->vm_pgoff,
sz, npol? npol->policy : -1,
- npol ? npol->v.nodes[0] : -1);
+ npol ? nodes_addr(npol->v.nodes)[0] : -1);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1164,14 +1198,75 @@ void __init numa_policy_init(void)
/* Set interleaving policy for system init. This way not all
the data structures allocated at system boot end up in node zero. */
- if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
- MAX_NUMNODES) < 0)
+ if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
printk("numa_policy_init: interleaving failed\n");
}
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
void numa_default_policy(void)
{
- sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+ do_set_mempolicy(MPOL_DEFAULT, NULL);
+}
+
+/* Migrate a policy to a different set of nodes */
+static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
+ const nodemask_t *new)
+{
+ nodemask_t tmp;
+
+ if (!pol)
+ return;
+
+ switch (pol->policy) {
+ case MPOL_DEFAULT:
+ break;
+ case MPOL_INTERLEAVE:
+ nodes_remap(tmp, pol->v.nodes, *old, *new);
+ pol->v.nodes = tmp;
+ current->il_next = node_remap(current->il_next, *old, *new);
+ break;
+ case MPOL_PREFERRED:
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ *old, *new);
+ break;
+ case MPOL_BIND: {
+ nodemask_t nodes;
+ struct zone **z;
+ struct zonelist *zonelist;
+
+ nodes_clear(nodes);
+ for (z = pol->v.zonelist->zones; *z; z++)
+ node_set((*z)->zone_pgdat->node_id, nodes);
+ nodes_remap(tmp, nodes, *old, *new);
+ nodes = tmp;
+
+ zonelist = bind_zonelist(&nodes);
+
+ /* If no mem, then zonelist is NULL and we keep old zonelist.
+ * If that old zonelist has no remaining mems_allowed nodes,
+ * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
+ */
+
+ if (zonelist) {
+ /* Good - got mem - substitute new zonelist */
+ kfree(pol->v.zonelist);
+ pol->v.zonelist = zonelist;
+ }
+ break;
+ }
+ default:
+ BUG();
+ break;
+ }
+}
+
+/*
+ * Someone moved this task to different nodes. Fixup mempolicies.
+ *
+ * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
+ * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ */
+void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+{
+ rebind_policy(current->mempolicy, old, new);
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 9e377ea700b2..1a99b80480d3 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -205,7 +205,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
void *element;
unsigned long flags;
wait_queue_t wait;
- unsigned int gfp_temp;
+ gfp_t gfp_temp;
might_sleep_if(gfp_mask & __GFP_WAIT);
diff --git a/mm/mmap.c b/mm/mmap.c
index fa11d91242e8..320dda1778c3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -181,26 +181,36 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
}
/*
- * Remove one vm structure and free it.
+ * Unlink a file-based vm structure from its prio_tree, to hide
+ * vma from rmap and vmtruncate before freeing its page tables.
*/
-static void remove_vm_struct(struct vm_area_struct *vma)
+void unlink_file_vma(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
- might_sleep();
if (file) {
struct address_space *mapping = file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
__remove_shared_vm_struct(vma, file, mapping);
spin_unlock(&mapping->i_mmap_lock);
}
+}
+
+/*
+ * Close a vm structure and free it, returning the next.
+ */
+static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+{
+ struct vm_area_struct *next = vma->vm_next;
+
+ might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (file)
- fput(file);
- anon_vma_unlink(vma);
+ if (vma->vm_file)
+ fput(vma->vm_file);
mpol_free(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
+ return next;
}
asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -832,7 +842,7 @@ none:
}
#ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
+void vm_stat_account(struct mm_struct *mm, unsigned long flags,
struct file *file, long pages)
{
const unsigned long stack_flags
@@ -1070,6 +1080,17 @@ munmap_back:
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+ == (VM_WRITE | VM_RESERVED)) {
+ printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+ "PROT_WRITE mmap of VM_RESERVED memory, which "
+ "is deprecated. Please report this to "
+ "linux-kernel@vger.kernel.org\n",current->comm);
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+ error = -EACCES;
+ goto unmap_and_free_vma;
+ }
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
@@ -1110,7 +1131,7 @@ munmap_back:
}
out:
mm->total_vm += len >> PAGE_SHIFT;
- __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+ vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
@@ -1475,15 +1496,19 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
mm->total_vm += grow;
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
- __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
}
-#ifdef CONFIG_STACK_GROWSUP
+#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
/*
- * vma is the first one with address > vma->vm_end. Have to extend vma.
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-int expand_stack(struct vm_area_struct * vma, unsigned long address)
+#ifdef CONFIG_STACK_GROWSUP
+static inline
+#endif
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
@@ -1521,6 +1546,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address)
anon_vma_unlock(vma);
return error;
}
+#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_upwards(vma, address);
+}
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
@@ -1603,36 +1635,24 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
}
#endif
-/* Normal function to fix up a mapping
- * This function is the default for when an area has no specific
- * function. This may be used as part of a more specific routine.
- *
- * By the time this function is called, the area struct has been
- * removed from the process mapping list.
- */
-static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
-{
- size_t len = area->vm_end - area->vm_start;
-
- area->vm_mm->total_vm -= len >> PAGE_SHIFT;
- if (area->vm_flags & VM_LOCKED)
- area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
- vm_stat_unaccount(area);
- remove_vm_struct(area);
-}
-
/*
- * Update the VMA and inode share lists.
- *
- * Ok - we have the memory areas we should free on the 'free' list,
+ * Ok - we have the memory areas we should free on the vma list,
* so release them, and do the vma updates.
+ *
+ * Called with the mm semaphore held.
*/
-static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
+ /* Update high watermark before we lower total_vm */
+ update_hiwater_vm(mm);
do {
- struct vm_area_struct *next = vma->vm_next;
- unmap_vma(mm, vma);
- vma = next;
+ long nrpages = vma_pages(vma);
+
+ mm->total_vm -= nrpages;
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm -= nrpages;
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+ vma = remove_vma(vma);
} while (vma);
validate_mm(mm);
}
@@ -1651,14 +1671,13 @@ static void unmap_region(struct mm_struct *mm,
unsigned long nr_accounted = 0;
lru_add_drain();
- spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
- unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
+ update_hiwater_rss(mm);
+ unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
- spin_unlock(&mm->page_table_lock);
}
/*
@@ -1799,7 +1818,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
unmap_region(mm, vma, prev, start, end);
/* Fix up all other VM information */
- unmap_vma_list(mm, vma);
+ remove_vma_list(mm, vma);
return 0;
}
@@ -1821,7 +1840,7 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_VM
if (unlikely(down_read_trylock(&mm->mmap_sem))) {
WARN_ON(1);
up_read(&mm->mmap_sem);
@@ -1933,34 +1952,21 @@ void exit_mmap(struct mm_struct *mm)
unsigned long end;
lru_add_drain();
-
- spin_lock(&mm->page_table_lock);
-
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
+ /* Don't update_hiwater_rss(mm) here, do_exit already did */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
+ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
- mm->mmap = mm->mmap_cache = NULL;
- mm->mm_rb = RB_ROOT;
- set_mm_counter(mm, rss, 0);
- mm->total_vm = 0;
- mm->locked_vm = 0;
-
- spin_unlock(&mm->page_table_lock);
-
/*
- * Walk the list again, actually closing and freeing it
- * without holding any MM locks.
+ * Walk the list again, actually closing and freeing it,
+ * with preemption enabled, without holding any MM locks.
*/
- while (vma) {
- struct vm_area_struct *next = vma->vm_next;
- remove_vm_struct(vma);
- vma = next;
- }
+ while (vma)
+ vma = remove_vma(vma);
BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 57577f63b305..17a2b52b753b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
if (pte_present(*pte)) {
pte_t ptent;
@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
lazy_mmu_prot_update(ptent);
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
}
static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma,
change_pud_range(mm, pgd, addr, next, newprot);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
- spin_unlock(&mm->page_table_lock);
}
static int
@@ -125,6 +124,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
* a MAP_NORESERVE private mapping to writable will now reserve.
*/
if (newflags & VM_WRITE) {
+ if (oldflags & VM_RESERVED) {
+ BUG_ON(oldflags & VM_WRITE);
+ printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+ "PROT_WRITE mprotect of VM_RESERVED memory, "
+ "which is deprecated. Please report this to "
+ "linux-kernel@vger.kernel.org\n",current->comm);
+ return -EACCES;
+ }
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
charged = nrpages;
if (security_vm_enough_memory(charged))
@@ -168,8 +175,8 @@ success:
vma->vm_flags = newflags;
vma->vm_page_prot = newprot;
change_protection(vma, start, end, newprot);
- __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
- __vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+ vm_stat_account(mm, newflags, vma->vm_file, nrpages);
return 0;
fail:
diff --git a/mm/mremap.c b/mm/mremap.c
index f343fc73a8bd..b535438c363c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,35 +22,7 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte = NULL;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_none_or_clear_bad(pgd))
- goto end;
-
- pud = pud_offset(pgd, addr);
- if (pud_none_or_clear_bad(pud))
- goto end;
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none_or_clear_bad(pmd))
- goto end;
-
- pte = pte_offset_map_nested(pmd, addr);
- if (pte_none(*pte)) {
- pte_unmap_nested(pte);
- pte = NULL;
- }
-end:
- return pte;
-}
-
-static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
@@ -68,35 +40,39 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
if (pmd_none_or_clear_bad(pmd))
return NULL;
- return pte_offset_map(pmd, addr);
+ return pmd;
}
-static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
return NULL;
+
pmd = pmd_alloc(mm, pud, addr);
- if (pmd)
- pte = pte_alloc_map(mm, pmd, addr);
- return pte;
+ if (!pmd)
+ return NULL;
+
+ if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+ return NULL;
+
+ return pmd;
}
-static int
-move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
- struct vm_area_struct *new_vma, unsigned long new_addr)
+static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+ unsigned long old_addr, unsigned long old_end,
+ struct vm_area_struct *new_vma, pmd_t *new_pmd,
+ unsigned long new_addr)
{
struct address_space *mapping = NULL;
struct mm_struct *mm = vma->vm_mm;
- int error = 0;
- pte_t *src, *dst;
+ pte_t *old_pte, *new_pte, pte;
+ spinlock_t *old_ptl, *new_ptl;
if (vma->vm_file) {
/*
@@ -111,74 +87,69 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
new_vma->vm_truncate_count != vma->vm_truncate_count)
new_vma->vm_truncate_count = 0;
}
- spin_lock(&mm->page_table_lock);
- src = get_one_pte_map_nested(mm, old_addr);
- if (src) {
- /*
- * Look to see whether alloc_one_pte_map needs to perform a
- * memory allocation. If it does then we need to drop the
- * atomic kmap
- */
- dst = get_one_pte_map(mm, new_addr);
- if (unlikely(!dst)) {
- pte_unmap_nested(src);
- if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
- dst = alloc_one_pte_map(mm, new_addr);
- if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
- spin_unlock(&mm->page_table_lock);
- spin_lock(&mapping->i_mmap_lock);
- spin_lock(&mm->page_table_lock);
- }
- src = get_one_pte_map_nested(mm, old_addr);
- }
- /*
- * Since alloc_one_pte_map can drop and re-acquire
- * page_table_lock, we should re-check the src entry...
- */
- if (src) {
- if (dst) {
- pte_t pte;
- pte = ptep_clear_flush(vma, old_addr, src);
-
- /* ZERO_PAGE can be dependant on virtual addr */
- pte = move_pte(pte, new_vma->vm_page_prot,
- old_addr, new_addr);
- set_pte_at(mm, new_addr, dst, pte);
- } else
- error = -ENOMEM;
- pte_unmap_nested(src);
- }
- if (dst)
- pte_unmap(dst);
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * pte locks because exclusive mmap_sem prevents deadlock.
+ */
+ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_ptl = pte_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock(new_ptl);
+
+ for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+ new_pte++, new_addr += PAGE_SIZE) {
+ if (pte_none(*old_pte))
+ continue;
+ pte = ptep_clear_flush(vma, old_addr, old_pte);
+ /* ZERO_PAGE can be dependant on virtual addr */
+ pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+ set_pte_at(mm, new_addr, new_pte, pte);
}
- spin_unlock(&mm->page_table_lock);
+
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ pte_unmap_nested(new_pte - 1);
+ pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
- return error;
}
+#define LATENCY_LIMIT (64 * PAGE_SIZE)
+
static unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len)
{
- unsigned long offset;
+ unsigned long extent, next, old_end;
+ pmd_t *old_pmd, *new_pmd;
- flush_cache_range(vma, old_addr, old_addr + len);
+ old_end = old_addr + len;
+ flush_cache_range(vma, old_addr, old_end);
- /*
- * This is not the clever way to do this, but we're taking the
- * easy way out on the assumption that most remappings will be
- * only a few pages.. This also makes error recovery easier.
- */
- for (offset = 0; offset < len; offset += PAGE_SIZE) {
- if (move_one_page(vma, old_addr + offset,
- new_vma, new_addr + offset) < 0)
- break;
+ for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
+ next = (old_addr + PMD_SIZE) & PMD_MASK;
+ if (next - 1 > old_end)
+ next = old_end;
+ extent = next - old_addr;
+ old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ if (!old_pmd)
+ continue;
+ new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ if (!new_pmd)
+ break;
+ next = (new_addr + PMD_SIZE) & PMD_MASK;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
+ if (extent > LATENCY_LIMIT)
+ extent = LATENCY_LIMIT;
+ move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+ new_vma, new_pmd, new_addr);
}
- return offset;
+
+ return len + old_addr - old_end; /* how much done */
}
static unsigned long move_vma(struct vm_area_struct *vma,
@@ -191,6 +162,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long new_pgoff;
unsigned long moved_len;
unsigned long excess = 0;
+ unsigned long hiwater_vm;
int split = 0;
/*
@@ -229,17 +201,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
/*
- * if we failed to move page tables we still do total_vm increment
- * since do_munmap() will decrement it by old_len == new_len
+ * If we failed to move page tables we still do total_vm increment
+ * since do_munmap() will decrement it by old_len == new_len.
+ *
+ * Since total_vm is about to be raised artificially high for a
+ * moment, we need to restore high watermark afterwards: if stats
+ * are taken meanwhile, total_vm and hiwater_vm appear too high.
+ * If this were a serious issue, we'd add a flag to do_munmap().
*/
+ hiwater_vm = mm->hiwater_vm;
mm->total_vm += new_len >> PAGE_SHIFT;
- __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
if (do_munmap(mm, old_addr, old_len) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
+ mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
if (excess) {
@@ -269,6 +248,7 @@ unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
+ struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
@@ -309,7 +289,7 @@ unsigned long do_mremap(unsigned long addr,
if ((addr <= new_addr) && (addr+old_len) > new_addr)
goto out;
- ret = do_munmap(current->mm, new_addr, new_len);
+ ret = do_munmap(mm, new_addr, new_len);
if (ret)
goto out;
}
@@ -320,7 +300,7 @@ unsigned long do_mremap(unsigned long addr,
* do_munmap does all the needed commit accounting
*/
if (old_len >= new_len) {
- ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
+ ret = do_munmap(mm, addr+new_len, old_len - new_len);
if (ret && old_len != new_len)
goto out;
ret = addr;
@@ -333,7 +313,7 @@ unsigned long do_mremap(unsigned long addr,
* Ok, we need to grow.. or relocate.
*/
ret = -EFAULT;
- vma = find_vma(current->mm, addr);
+ vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
goto out;
if (is_vm_hugetlb_page(vma)) {
@@ -349,14 +329,14 @@ unsigned long do_mremap(unsigned long addr,
}
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
- locked = current->mm->locked_vm << PAGE_SHIFT;
+ locked = mm->locked_vm << PAGE_SHIFT;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
locked += new_len - old_len;
ret = -EAGAIN;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
goto out;
}
- if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) {
+ if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
ret = -ENOMEM;
goto out;
}
@@ -383,11 +363,10 @@ unsigned long do_mremap(unsigned long addr,
vma_adjust(vma, vma->vm_start,
addr + new_len, vma->vm_pgoff, NULL);
- current->mm->total_vm += pages;
- __vm_stat_account(vma->vm_mm, vma->vm_flags,
- vma->vm_file, pages);
+ mm->total_vm += pages;
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
if (vma->vm_flags & VM_LOCKED) {
- current->mm->locked_vm += pages;
+ mm->locked_vm += pages;
make_pages_present(addr + old_len,
addr + new_len);
}
diff --git a/mm/msync.c b/mm/msync.c
index d0f5a1bce7cb..0e040e9c39d8 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -17,40 +17,48 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-/*
- * Called with mm->page_table_lock held to protect against other
- * threads/the swapper from ripping pte's out from under us.
- */
-
-static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end)
{
pte_t *pte;
+ spinlock_t *ptl;
+ int progress = 0;
- pte = pte_offset_map(pmd, addr);
+again:
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
unsigned long pfn;
struct page *page;
+ if (progress >= 64) {
+ progress = 0;
+ if (need_resched() || need_lockbreak(ptl))
+ break;
+ }
+ progress++;
if (!pte_present(*pte))
continue;
if (!pte_maybe_dirty(*pte))
continue;
pfn = pte_pfn(*pte);
- if (!pfn_valid(pfn))
+ if (unlikely(!pfn_valid(pfn))) {
+ print_bad_pte(vma, *pte, addr);
continue;
+ }
page = pfn_to_page(pfn);
- if (PageReserved(page))
- continue;
if (ptep_clear_flush_dirty(vma, addr, pte) ||
page_test_and_clear_dirty(page))
set_page_dirty(page);
+ progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ if (addr != end)
+ goto again;
}
-static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end)
{
pmd_t *pmd;
@@ -61,11 +69,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- sync_pte_range(vma, pmd, addr, next);
+ msync_pte_range(vma, pmd, addr, next);
} while (pmd++, addr = next, addr != end);
}
-static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end)
{
pud_t *pud;
@@ -76,58 +84,34 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- sync_pmd_range(vma, pud, addr, next);
+ msync_pmd_range(vma, pud, addr, next);
} while (pud++, addr = next, addr != end);
}
-static void sync_page_range(struct vm_area_struct *vma,
+static void msync_page_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
- struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
/* For hugepages we can't go walking the page table normally,
* but that's ok, hugetlbfs is memory based, so we don't need
- * to do anything more on an msync() */
- if (is_vm_hugetlb_page(vma))
+ * to do anything more on an msync().
+ * Can't do anything with VM_RESERVED regions either.
+ */
+ if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
return;
BUG_ON(addr >= end);
- pgd = pgd_offset(mm, addr);
+ pgd = pgd_offset(vma->vm_mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- sync_pud_range(vma, pgd, addr, next);
+ msync_pud_range(vma, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
-}
-
-#ifdef CONFIG_PREEMPT
-static inline void filemap_sync(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- const size_t chunk = 64 * 1024; /* bytes */
- unsigned long next;
-
- do {
- next = addr + chunk;
- if (next > end || next < addr)
- next = end;
- sync_page_range(vma, addr, next);
- cond_resched();
- } while (addr = next, addr != end);
-}
-#else
-static inline void filemap_sync(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- sync_page_range(vma, addr, end);
}
-#endif
/*
* MS_SYNC syncs the entire file - including mappings.
@@ -150,7 +134,7 @@ static int msync_interval(struct vm_area_struct *vma,
return -EBUSY;
if (file && (vma->vm_flags & VM_SHARED)) {
- filemap_sync(vma, addr, end);
+ msync_page_range(vma, addr, end);
if (flags & MS_SYNC) {
struct address_space *mapping = file->f_mapping;
diff --git a/mm/nommu.c b/mm/nommu.c
index 0ef241ae3763..d1e076a487cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
realalloc -= kobjsize(vml);
askedalloc -= sizeof(*vml);
kfree(vml);
+
+ update_hiwater_vm(mm);
mm->total_vm -= len >> PAGE_SHIFT;
#ifdef DEBUG
@@ -1047,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+ unsigned int foll_flags)
{
return NULL;
}
@@ -1078,19 +1081,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
}
-void update_mem_hiwater(struct task_struct *tsk)
-{
- unsigned long rss;
-
- if (likely(tsk->mm)) {
- rss = get_mm_counter(tsk->mm, rss);
- if (tsk->mm->hiwater_rss < rss)
- tsk->mm->hiwater_rss = rss;
- if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
- tsk->mm->hiwater_vm = tsk->mm->total_vm;
- }
-}
-
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen,
int even_cows)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e1d3d77f4aee..2dbdd98426fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -33,6 +33,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
@@ -78,21 +79,44 @@ int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+ int ret = 0;
+ unsigned seq;
+ unsigned long pfn = page_to_pfn(page);
+
+ do {
+ seq = zone_span_seqbegin(zone);
+ if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+ ret = 1;
+ else if (pfn < zone->zone_start_pfn)
+ ret = 1;
+ } while (zone_span_seqretry(zone, seq));
+
+ return ret;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+ if (!pfn_valid(page_to_pfn(page)))
+ return 0;
+#endif
+ if (zone != page_zone(page))
+ return 0;
+
+ return 1;
+}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
static int bad_range(struct zone *zone, struct page *page)
{
- if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+ if (page_outside_zone_boundaries(zone, page))
return 1;
- if (page_to_pfn(page) < zone->zone_start_pfn)
- return 1;
-#ifdef CONFIG_HOLES_IN_ZONE
- if (!pfn_valid(page_to_pfn(page)))
- return 1;
-#endif
- if (zone != page_zone(page))
+ if (!page_is_consistent(zone, page))
return 1;
+
return 0;
}
@@ -114,7 +138,8 @@ static void bad_page(const char *function, struct page *page)
1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
- 1 << PG_writeback);
+ 1 << PG_writeback |
+ 1 << PG_reserved );
set_page_count(page, 0);
reset_page_mapcount(page);
page->mapping = NULL;
@@ -153,7 +178,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
struct page *p = page + i;
SetPageCompound(p);
- p->private = (unsigned long)page;
+ set_page_private(p, (unsigned long)page);
}
}
@@ -173,7 +198,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
if (!PageCompound(p))
bad_page(__FUNCTION__, page);
- if (p->private != (unsigned long)page)
+ if (page_private(p) != (unsigned long)page)
bad_page(__FUNCTION__, page);
ClearPageCompound(p);
}
@@ -186,18 +211,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
* So, we don't need atomic page->flags operations here.
*/
static inline unsigned long page_order(struct page *page) {
- return page->private;
+ return page_private(page);
}
static inline void set_page_order(struct page *page, int order) {
- page->private = order;
+ set_page_private(page, order);
__SetPagePrivate(page);
}
static inline void rmv_page_order(struct page *page)
{
__ClearPagePrivate(page);
- page->private = 0;
+ set_page_private(page, 0);
}
/*
@@ -237,14 +262,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (a) the buddy is free &&
* (b) the buddy is on the buddy system &&
* (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
*
*/
static inline int page_is_buddy(struct page *page, int order)
{
if (PagePrivate(page) &&
(page_order(page) == order) &&
- !PageReserved(page) &&
page_count(page) == 0)
return 1;
return 0;
@@ -264,7 +288,7 @@ static inline int page_is_buddy(struct page *page, int order)
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
@@ -327,7 +351,8 @@ static inline void free_pages_check(const char *function, struct page *page)
1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
- 1 << PG_writeback )))
+ 1 << PG_writeback |
+ 1 << PG_reserved )))
bad_page(function, page);
if (PageDirty(page))
__ClearPageDirty(page);
@@ -455,13 +480,14 @@ static void prep_new_page(struct page *page, int order)
1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
- 1 << PG_writeback )))
+ 1 << PG_writeback |
+ 1 << PG_reserved )))
bad_page(__FUNCTION__, page);
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked | 1 << PG_mappedtodisk);
- page->private = 0;
+ set_page_private(page, 0);
set_page_refs(page, order);
kernel_map_pages(page, 1 << order, 1);
}
@@ -734,7 +760,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
* of the allocation.
*/
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int can_try_harder, int gfp_high)
+ int classzone_idx, int can_try_harder, gfp_t gfp_high)
{
/* free_pages my go negative - that's OK */
long min = mark, free_pages = z->free_pages - (1 << order) + 1;
@@ -777,7 +803,7 @@ struct page * fastcall
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
- const int wait = gfp_mask & __GFP_WAIT;
+ const gfp_t wait = gfp_mask & __GFP_WAIT;
struct zone **zones, *z;
struct page *page;
struct reclaim_state reclaim_state;
@@ -996,7 +1022,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
* get_zeroed_page() returns a 32-bit address, which cannot represent
* a highmem page
*/
- BUG_ON(gfp_mask & __GFP_HIGHMEM);
+ BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
if (page)
@@ -1016,7 +1042,7 @@ void __pagevec_free(struct pagevec *pvec)
fastcall void __free_pages(struct page *page, unsigned int order)
{
- if (!PageReserved(page) && put_page_testzero(page)) {
+ if (put_page_testzero(page)) {
if (order == 0)
free_hot_page(page);
else
@@ -1089,7 +1115,7 @@ static unsigned int nr_free_zone_pages(int offset)
*/
unsigned int nr_free_buffer_pages(void)
{
- return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
+ return nr_free_zone_pages(gfp_zone(GFP_USER));
}
/*
@@ -1097,7 +1123,7 @@ unsigned int nr_free_buffer_pages(void)
*/
unsigned int nr_free_pagecache_pages(void)
{
- return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
+ return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
}
#ifdef CONFIG_HIGHMEM
@@ -1305,12 +1331,9 @@ void show_free_areas(void)
} else
printk("\n");
- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ for_each_cpu(cpu) {
struct per_cpu_pageset *pageset;
- if (!cpu_possible(cpu))
- continue;
-
pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
@@ -1428,6 +1451,16 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
return j;
}
+static inline int highest_zone(int zone_bits)
+{
+ int res = ZONE_NORMAL;
+ if (zone_bits & (__force int)__GFP_HIGHMEM)
+ res = ZONE_HIGHMEM;
+ if (zone_bits & (__force int)__GFP_DMA)
+ res = ZONE_DMA;
+ return res;
+}
+
#ifdef CONFIG_NUMA
#define MAX_NODE_LOAD (num_online_nodes())
static int __initdata node_load[MAX_NUMNODES];
@@ -1524,11 +1557,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
zonelist = pgdat->node_zonelists + i;
for (j = 0; zonelist->zones[j] != NULL; j++);
- k = ZONE_NORMAL;
- if (i & __GFP_HIGHMEM)
- k = ZONE_HIGHMEM;
- if (i & __GFP_DMA)
- k = ZONE_DMA;
+ k = highest_zone(i);
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
zonelist->zones[j] = NULL;
@@ -1549,12 +1578,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
zonelist = pgdat->node_zonelists + i;
j = 0;
- k = ZONE_NORMAL;
- if (i & __GFP_HIGHMEM)
- k = ZONE_HIGHMEM;
- if (i & __GFP_DMA)
- k = ZONE_DMA;
-
+ k = highest_zone(i);
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1659,7 +1683,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
struct page *page;
@@ -1673,7 +1697,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
continue;
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
- set_page_count(page, 0);
+ set_page_count(page, 1);
reset_page_mapcount(page);
SetPageReserved(page);
INIT_LIST_HEAD(&page->lru);
@@ -1720,29 +1744,29 @@ static int __devinit zone_batchsize(struct zone *zone)
/*
* The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
+ * size of the zone. But no more than 1/2 of a meg.
*
* OK, so we don't know how big the cache is. So guess.
*/
batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
+ if (batch * PAGE_SIZE > 512 * 1024)
+ batch = (512 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
/*
- * Clamp the batch to a 2^n - 1 value. Having a power
- * of 2 value was found to be more likely to have
- * suboptimal cache aliasing properties in some cases.
+ * We will be trying to allcoate bigger chunks of contiguous
+ * memory of the order of fls(batch). This should result in
+ * better cache coloring.
*
- * For example if 2 tasks are alternately allocating
- * batches of pages, one task can end up with a lot
- * of pages of one half of the possible page colors
- * and the other with pages of the other colors.
+ * A sanity check also to ensure that batch is still in limits.
*/
- batch = (1 << fls(batch + batch/2)) - 1;
+ batch = (1 << fls(batch + batch/2));
+
+ if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
+ batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
+
return batch;
}
@@ -1754,7 +1778,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
pcp = &p->pcp[0]; /* hot */
pcp->count = 0;
- pcp->low = 2 * batch;
+ pcp->low = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&pcp->list);
@@ -1763,7 +1787,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
pcp->count = 0;
pcp->low = 0;
pcp->high = 2 * batch;
- pcp->batch = max(1UL, 1 * batch);
+ pcp->batch = max(1UL, batch/2);
INIT_LIST_HEAD(&pcp->list);
}
@@ -1872,6 +1896,60 @@ void __init setup_per_cpu_pageset()
#endif
+static __devinit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+ int i;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+
+ /*
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per zone.
+ */
+ zone->wait_table_size = wait_table_size(zone_size_pages);
+ zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, zone->wait_table_size
+ * sizeof(wait_queue_head_t));
+
+ for(i = 0; i < zone->wait_table_size; ++i)
+ init_waitqueue_head(zone->wait_table + i);
+}
+
+static __devinit void zone_pcp_init(struct zone *zone)
+{
+ int cpu;
+ unsigned long batch = zone_batchsize(zone);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone->pageset[cpu] = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+ }
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
+ zone->name, zone->present_pages, batch);
+}
+
+static __devinit void init_currently_empty_zone(struct zone *zone,
+ unsigned long zone_start_pfn, unsigned long size)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+
+ zone_wait_table_init(zone, size);
+ pgdat->nr_zones = zone_idx(zone) + 1;
+
+ zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+ zone->zone_start_pfn = zone_start_pfn;
+
+ memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+ zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1881,10 +1959,11 @@ void __init setup_per_cpu_pageset()
static void __init free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
- unsigned long i, j;
- int cpu, nid = pgdat->node_id;
+ unsigned long j;
+ int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
+ pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
@@ -1892,7 +1971,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize;
- unsigned long batch;
realsize = size = zones_size[j];
if (zholes_size)
@@ -1907,24 +1985,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
+ zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- batch = zone_batchsize(zone);
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
- /* Early boot. Slab allocator not functional yet */
- zone->pageset[cpu] = &boot_pageset[cpu];
- setup_pageset(&boot_pageset[cpu],0);
-#else
- setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
- }
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
+ zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
zone->nr_scan_active = 0;
@@ -1935,32 +2002,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
if (!size)
continue;
- /*
- * The per-page waitqueue mechanism uses hashed waitqueues
- * per zone.
- */
- zone->wait_table_size = wait_table_size(size);
- zone->wait_table_bits =
- wait_table_bits(zone->wait_table_size);
- zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, zone->wait_table_size
- * sizeof(wait_queue_head_t));
-
- for(i = 0; i < zone->wait_table_size; ++i)
- init_waitqueue_head(zone->wait_table + i);
-
- pgdat->nr_zones = j+1;
-
- zone->zone_mem_map = pfn_to_page(zone_start_pfn);
- zone->zone_start_pfn = zone_start_pfn;
-
- memmap_init(size, nid, j, zone_start_pfn);
-
zonetable_add(zone, nid, j, zone_start_pfn, size);
-
+ init_currently_empty_zone(zone, zone_start_pfn, size);
zone_start_pfn += size;
-
- zone_init_free_lists(pgdat, zone, zone->spanned_pages);
}
}
@@ -2360,7 +2404,7 @@ static void setup_per_zone_lowmem_reserve(void)
* that the pages_{min,low,high} values for each zone are set correctly
* with respect to min_free_kbytes.
*/
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
diff --git a/mm/page_io.c b/mm/page_io.c
index 330e00d6db00..bb2b0d53889c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
- bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+ bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+ end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
BUG_ON(!PageLocked(page));
ClearPageUptodate(page);
- bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+ bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+ end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
ret = -ENOMEM;
diff --git a/mm/pdflush.c b/mm/pdflush.c
index d6781951267e..52822c98c489 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -20,6 +20,7 @@
#include <linux/fs.h> // Needed by writeback.h
#include <linux/writeback.h> // Prototypes pdflush_operation()
#include <linux/kthread.h>
+#include <linux/cpuset.h>
/*
@@ -170,12 +171,24 @@ static int __pdflush(struct pdflush_work *my_work)
static int pdflush(void *dummy)
{
struct pdflush_work my_work;
+ cpumask_t cpus_allowed;
/*
* pdflush can spend a lot of time doing encryption via dm-crypt. We
* don't want to do that at keventd's priority.
*/
set_user_nice(current, 0);
+
+ /*
+ * Some configs put our parent kthread in a limited cpuset,
+ * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
+ * Our needs are more modest - cut back to our cpusets cpus_allowed.
+ * This is needed as pdflush's are dynamically created and destroyed.
+ * The boottime pdflush's are easily placed w/o these 2 lines.
+ */
+ cpus_allowed = cpuset_cpus_allowed(current);
+ set_cpus_allowed(current, cpus_allowed);
+
return __pdflush(&my_work);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 450f5241b5a5..914d04b98bee 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,7 @@
* page->flags PG_locked (lock_page)
* mapping->i_mmap_lock
* anon_vma->lock
- * mm->page_table_lock
+ * mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
@@ -244,37 +244,44 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
/*
* Check that @page is mapped at @address into @mm.
*
- * On success returns with mapped pte and locked mm->page_table_lock.
+ * On success returns with pte mapped and locked.
*/
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
- unsigned long address)
+ unsigned long address, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ spinlock_t *ptl;
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- spin_lock(&mm->page_table_lock);
pgd = pgd_offset(mm, address);
- if (likely(pgd_present(*pgd))) {
- pud = pud_offset(pgd, address);
- if (likely(pud_present(*pud))) {
- pmd = pmd_offset(pud, address);
- if (likely(pmd_present(*pmd))) {
- pte = pte_offset_map(pmd, address);
- if (likely(pte_present(*pte) &&
- page_to_pfn(page) == pte_pfn(*pte)))
- return pte;
- pte_unmap(pte);
- }
- }
+ if (!pgd_present(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return NULL;
+
+ pte = pte_offset_map(pmd, address);
+ /* Make a quick check before getting the lock */
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return NULL;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
+ *ptlp = ptl;
+ return pte;
}
- spin_unlock(&mm->page_table_lock);
- return ERR_PTR(-ENOENT);
+ pte_unmap_unlock(pte, ptl);
+ return NULL;
}
/*
@@ -287,24 +294,28 @@ static int page_referenced_one(struct page *page,
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
+ spinlock_t *ptl;
int referenced = 0;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address);
- if (!IS_ERR(pte)) {
- if (ptep_clear_flush_young(vma, address, pte))
- referenced++;
+ pte = page_check_address(page, mm, address, &ptl);
+ if (!pte)
+ goto out;
- if (mm != current->mm && !ignore_token && has_swap_token(mm))
- referenced++;
+ if (ptep_clear_flush_young(vma, address, pte))
+ referenced++;
- (*mapcount)--;
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
- }
+ /* Pretend the page is referenced if the task has the
+ swap token and is in the middle of a page fault. */
+ if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
+ rwsem_is_locked(&mm->mmap_sem))
+ referenced++;
+
+ (*mapcount)--;
+ pte_unmap_unlock(pte, ptl);
out:
return referenced;
}
@@ -434,15 +445,11 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- BUG_ON(PageReserved(page));
-
- inc_mm_counter(vma->vm_mm, anon_rss);
-
if (atomic_inc_and_test(&page->_mapcount)) {
struct anon_vma *anon_vma = vma->anon_vma;
@@ -461,13 +468,12 @@ void page_add_anon_rmap(struct page *page,
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_add_file_rmap(struct page *page)
{
BUG_ON(PageAnon(page));
- if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
- return;
+ BUG_ON(!pfn_valid(page_to_pfn(page)));
if (atomic_inc_and_test(&page->_mapcount))
inc_page_state(nr_mapped);
@@ -477,12 +483,10 @@ void page_add_file_rmap(struct page *page)
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
*
- * Caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_remove_rmap(struct page *page)
{
- BUG_ON(PageReserved(page));
-
if (atomic_add_negative(-1, &page->_mapcount)) {
BUG_ON(page_mapcount(page) < 0);
/*
@@ -510,14 +514,15 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
unsigned long address;
pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
int ret = SWAP_AGAIN;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address);
- if (IS_ERR(pte))
+ pte = page_check_address(page, mm, address, &ptl);
+ if (!pte)
goto out;
/*
@@ -541,8 +546,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
if (pte_dirty(pteval))
set_page_dirty(page);
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
if (PageAnon(page)) {
- swp_entry_t entry = { .val = page->private };
+ swp_entry_t entry = { .val = page_private(page) };
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
@@ -551,21 +559,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
swap_duplicate(entry);
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
- list_add(&mm->mmlist, &init_mm.mmlist);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
dec_mm_counter(mm, anon_rss);
- }
+ } else
+ dec_mm_counter(mm, file_rss);
- dec_mm_counter(mm, rss);
page_remove_rmap(page);
page_cache_release(page);
out_unmap:
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
out:
return ret;
}
@@ -599,19 +607,14 @@ static void try_to_unmap_cluster(unsigned long cursor,
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte, *original_pte;
+ pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
struct page *page;
unsigned long address;
unsigned long end;
unsigned long pfn;
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- spin_lock(&mm->page_table_lock);
-
address = (vma->vm_start + cursor) & CLUSTER_MASK;
end = address + CLUSTER_SIZE;
if (address < vma->vm_start)
@@ -621,30 +624,33 @@ static void try_to_unmap_cluster(unsigned long cursor,
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
- goto out_unlock;
+ return;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
- goto out_unlock;
+ return;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
- goto out_unlock;
+ return;
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- for (original_pte = pte = pte_offset_map(pmd, address);
- address < end; pte++, address += PAGE_SIZE) {
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+ for (; address < end; pte++, address += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
pfn = pte_pfn(*pte);
- if (!pfn_valid(pfn))
+ if (unlikely(!pfn_valid(pfn))) {
+ print_bad_pte(vma, *pte, address);
continue;
+ }
page = pfn_to_page(pfn);
BUG_ON(PageAnon(page));
- if (PageReserved(page))
- continue;
if (ptep_clear_flush_young(vma, address, pte))
continue;
@@ -663,13 +669,10 @@ static void try_to_unmap_cluster(unsigned long cursor,
page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, rss);
+ dec_mm_counter(mm, file_rss);
(*mapcount)--;
}
-
- pte_unmap(original_pte);
-out_unlock:
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte - 1, ptl);
}
static int try_to_unmap_anon(struct page *page)
@@ -806,7 +809,6 @@ int try_to_unmap(struct page *page)
{
int ret;
- BUG_ON(PageReserved(page));
BUG_ON(!PageLocked(page));
if (PageAnon(page))
diff --git a/mm/shmem.c b/mm/shmem.c
index ea064d89cda9..dc25565a61e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -71,9 +71,6 @@
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
-/* Keep swapped page count in private field of indirect struct page */
-#define nr_swapped private
-
/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
SGP_QUICK, /* don't try more than file page cache lookup */
@@ -85,7 +82,7 @@ enum sgp_type {
static int shmem_getpage(struct inode *inode, unsigned long idx,
struct page **pagep, enum sgp_type sgp, int *type);
-static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
+static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
{
/*
* The above definition of ENTRIES_PER_PAGE, and the use of
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
entry->val = value;
info->swapped += incdec;
- if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
- kmap_atomic_to_page(entry)->nr_swapped += incdec;
+ if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
+ struct page *page = kmap_atomic_to_page(entry);
+ set_page_private(page, page_private(page) + incdec);
+ }
}
/*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
spin_unlock(&info->lock);
page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
- if (page) {
- page->nr_swapped = 0;
- }
+ if (page)
+ set_page_private(page, 0);
spin_lock(&info->lock);
if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
diroff = 0;
}
subdir = dir[diroff];
- if (subdir && subdir->nr_swapped) {
+ if (subdir && page_private(subdir)) {
size = limit - idx;
if (size > ENTRIES_PER_PAGE)
size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
nr_swaps_freed += freed;
if (offset)
spin_lock(&info->lock);
- subdir->nr_swapped -= freed;
+ set_page_private(subdir, page_private(subdir) - freed);
if (offset)
spin_unlock(&info->lock);
- BUG_ON(subdir->nr_swapped > offset);
+ BUG_ON(page_private(subdir) > offset);
}
if (offset)
offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
dir = shmem_dir_map(subdir);
}
subdir = *dir;
- if (subdir && subdir->nr_swapped) {
+ if (subdir && page_private(subdir)) {
ptr = shmem_swp_map(subdir);
size = limit - idx;
if (size > ENTRIES_PER_PAGE)
@@ -898,7 +896,7 @@ struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
}
static struct page *
-shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
+shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
unsigned long idx)
{
struct vm_area_struct pvma;
@@ -1201,7 +1199,7 @@ static int shmem_populate(struct vm_area_struct *vma,
page_cache_release(page);
return err;
}
- } else {
+ } else if (vma->vm_flags & VM_NONLINEAR) {
/* No page was found just because we can't read it in
* now (being here implies nonblock != 0), but the page
* may exist, so set the PTE to fault it in later. */
@@ -1506,8 +1504,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
*/
if (!offset)
mark_page_accessed(page);
- } else
+ } else {
page = ZERO_PAGE(0);
+ page_cache_get(page);
+ }
/*
* Ok, we have the page, and it's up-to-date, so
diff --git a/mm/slab.c b/mm/slab.c
index d05c678bceb3..22bfb0b2ac8b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -386,7 +386,7 @@ struct kmem_cache_s {
unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */
- unsigned int gfpflags;
+ gfp_t gfpflags;
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
@@ -2117,7 +2117,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
slabp->free = 0;
}
-static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags)
+static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags)
{
if (flags & SLAB_DMA) {
if (!(cachep->gfpflags & GFP_DMA))
@@ -2152,7 +2152,7 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
struct slab *slabp;
void *objp;
size_t offset;
- unsigned int local_flags;
+ gfp_t local_flags;
unsigned long ctor_flags;
struct kmem_list3 *l3;
@@ -2419,6 +2419,7 @@ retry:
next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+ WARN_ON(numa_node_id() != slabp->nodeid);
#endif
slabp->free = next;
}
@@ -2546,7 +2547,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
/*
* A interface to enable slab creation on nodeid
*/
-static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
+static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
struct list_head *entry;
struct slab *slabp;
@@ -2633,8 +2634,10 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
check_spinlock_acquired_node(cachep, node);
check_slabp(cachep, slabp);
-
#if DEBUG
+ /* Verify that the slab belongs to the intended node */
+ WARN_ON(slabp->nodeid != node);
+
if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
printk(KERN_ERR "slab: double free detected in cache "
"'%s', objp %p\n", cachep->name, objp);
diff --git a/mm/sparse.c b/mm/sparse.c
index 347249a4917a..72079b538e2d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,8 +5,10 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
#include <asm/dma.h>
/*
@@ -72,6 +74,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
}
#endif
+/*
+ * Although written for the SPARSEMEM_EXTREME case, this happens
+ * to also work for the flat array case becase
+ * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
+ */
+int __section_nr(struct mem_section* ms)
+{
+ unsigned long root_nr;
+ struct mem_section* root;
+
+ for (root_nr = 0;
+ root_nr < NR_MEM_SECTIONS;
+ root_nr += SECTIONS_PER_ROOT) {
+ root = __nr_to_section(root_nr);
+
+ if (!root)
+ continue;
+
+ if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
+ break;
+ }
+
+ return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
+}
+
/* Record a memory area against a node. */
void memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -162,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
return NULL;
}
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+ struct page *page, *ret;
+ unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+ page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+ if (page)
+ goto got_map_page;
+
+ ret = vmalloc(memmap_size);
+ if (ret)
+ goto got_map_ptr;
+
+ return NULL;
+got_map_page:
+ ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+ memset(ret, 0, memmap_size);
+
+ return ret;
+}
+
+static int vaddr_in_vmalloc_area(void *addr)
+{
+ if (addr >= (void *)VMALLOC_START &&
+ addr < (void *)VMALLOC_END)
+ return 1;
+ return 0;
+}
+
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+{
+ if (vaddr_in_vmalloc_area(memmap))
+ vfree(memmap);
+ else
+ free_pages((unsigned long)memmap,
+ get_order(sizeof(struct page) * nr_pages));
+}
+
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
@@ -187,14 +253,37 @@ void sparse_init(void)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+ int nr_pages)
{
- struct mem_section *ms = __pfn_to_section(start_pfn);
+ unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ struct mem_section *ms;
+ struct page *memmap;
+ unsigned long flags;
+ int ret;
- if (ms->section_mem_map & SECTION_MARKED_PRESENT)
- return -EEXIST;
+ /*
+ * no locking for this, because it does its own
+ * plus, it does a kmalloc
+ */
+ sparse_index_init(section_nr, pgdat->node_id);
+ memmap = __kmalloc_section_memmap(nr_pages);
+
+ pgdat_resize_lock(pgdat, &flags);
+ ms = __pfn_to_section(start_pfn);
+ if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
+ ret = -EEXIST;
+ goto out;
+ }
ms->section_mem_map |= SECTION_MARKED_PRESENT;
- return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+ ret = sparse_init_one_section(ms, section_nr, memmap);
+
+ if (ret <= 0)
+ __kfree_section_memmap(memmap, nr_pages);
+out:
+ pgdat_resize_unlock(pgdat, &flags);
+ return ret;
}
diff --git a/mm/swap.c b/mm/swap.c
index 7771d2803f62..96387e20184a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,7 +39,7 @@ int page_cluster;
void put_page(struct page *page)
{
if (unlikely(PageCompound(page))) {
- page = (struct page *)page->private;
+ page = (struct page *)page_private(page);
if (put_page_testzero(page)) {
void (*dtor)(struct page *page);
@@ -48,7 +48,7 @@ void put_page(struct page *page)
}
return;
}
- if (!PageReserved(page) && put_page_testzero(page))
+ if (put_page_testzero(page))
__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
struct page *page = pages[i];
struct zone *pagezone;
- if (PageReserved(page) || !put_page_testzero(page))
+ if (!put_page_testzero(page))
continue;
pagezone = page_zone(page);
@@ -270,7 +270,6 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
struct pagevec pages_to_free;
pagevec_init(&pages_to_free, pvec->cold);
- pages_to_free.cold = pvec->cold;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 132164f7d0a7..dfd9a46755b8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
page_cache_get(page);
SetPageLocked(page);
SetPageSwapCache(page);
- page->private = entry.val;
+ set_page_private(page, entry.val);
total_swapcache_pages++;
pagecache_acct(1);
}
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
BUG_ON(PageWriteback(page));
BUG_ON(PagePrivate(page));
- radix_tree_delete(&swapper_space.page_tree, page->private);
- page->private = 0;
+ radix_tree_delete(&swapper_space.page_tree, page_private(page));
+ set_page_private(page, 0);
ClearPageSwapCache(page);
total_swapcache_pages--;
pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
- entry.val = page->private;
+ entry.val = page_private(page);
write_lock_irq(&swapper_space.tree_lock);
__delete_from_swap_cache(page);
@@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page)
/*
* Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page. Can not do a lock_page,
- * as we are holding the page_table_lock spinlock.
+ * this page if it is the last user of the page.
*/
void free_page_and_swap_cache(struct page *page)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1dcaeda039f4..8970c0b74194 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
swp_entry_t entry;
down_read(&swap_unplug_sem);
- entry.val = page->private;
+ entry.val = page_private(page);
if (PageSwapCache(page)) {
struct block_device *bdev = swap_info[swp_type(entry)].bdev;
struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
/*
* If the page is removed from swapcache from under us (with a
* racy try_to_unuse/swapoff) we need an additional reference
- * count to avoid reading garbage from page->private above. If
- * the WARN_ON triggers during a swapoff it maybe the race
+ * count to avoid reading garbage from page_private(page) above.
+ * If the WARN_ON triggers during a swapoff it maybe the race
* condition and it's harmless. However if it triggers without
* swapoff it signals a problem.
*/
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
struct swap_info_struct *p;
swp_entry_t entry;
- entry.val = page->private;
+ entry.val = page_private(page);
p = swap_info_get(entry);
if (p) {
/* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
if (page_count(page) != 2) /* 2: us + cache */
return 0;
- entry.val = page->private;
+ entry.val = page_private(page);
p = swap_info_get(entry);
if (!p)
return 0;
@@ -398,17 +398,14 @@ void free_swap_and_cache(swp_entry_t entry)
}
/*
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited). We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
- *
- * vma->vm_mm->page_table_lock is held.
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
*/
static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- inc_mm_counter(vma->vm_mm, rss);
+ inc_mm_counter(vma->vm_mm, anon_rss);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -425,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
swp_entry_t entry, struct page *page)
{
- pte_t *pte;
pte_t swp_pte = swp_entry_to_pte(entry);
+ pte_t *pte;
+ spinlock_t *ptl;
+ int found = 0;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
/*
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
if (unlikely(pte_same(*pte, swp_pte))) {
- unuse_pte(vma, pte, addr, entry, page);
- pte_unmap(pte);
- return 1;
+ unuse_pte(vma, pte++, addr, entry, page);
+ found = 1;
+ break;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
- return 0;
+ pte_unmap_unlock(pte - 1, ptl);
+ return found;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -523,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm,
down_read(&mm->mmap_sem);
lock_page(page);
}
- spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma && unuse_vma(vma, entry, page))
break;
}
- spin_unlock(&mm->page_table_lock);
up_read(&mm->mmap_sem);
/*
* Currently unuse_mm cannot fail, but leave error handling
@@ -1045,7 +1042,7 @@ int page_queue_congested(struct page *page)
BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page->private };
+ swp_entry_t entry = { .val = page_private(page) };
struct swap_info_struct *sis;
sis = get_swap_info_struct(swp_type(entry));
diff --git a/mm/thrash.c b/mm/thrash.c
index 11461f7ad830..eff3c18c33a1 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -19,7 +19,7 @@ static unsigned long swap_token_check;
struct mm_struct * swap_token_mm = &init_mm;
#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT 0
+#define SWAP_TOKEN_TIMEOUT (300 * HZ)
/*
* Currently disabled; Needs further code to work at HZ * 300.
*/
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index c13a2161bca2..b58abcf44ed6 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -31,11 +31,14 @@ static struct vfsmount *shm_mnt;
static int __init init_tmpfs(void)
{
- register_filesystem(&tmpfs_fs_type);
+ BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
+
#ifdef CONFIG_TMPFS
devfs_mk_dir("shm");
#endif
shm_mnt = kern_mount(&tmpfs_fs_type);
+ BUG_ON(IS_ERR(shm_mnt));
+
return 0;
}
module_init(init_tmpfs)
diff --git a/mm/truncate.c b/mm/truncate.c
index 60c8764bfac2..29c18f68dc35 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,18 +13,9 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
- block_invalidatepage */
+ do_invalidatepage */
-static int do_invalidatepage(struct page *page, unsigned long offset)
-{
- int (*invalidatepage)(struct page *, unsigned long);
- invalidatepage = page->mapping->a_ops->invalidatepage;
- if (invalidatepage == NULL)
- invalidatepage = block_invalidatepage;
- return (*invalidatepage)(page, offset);
-}
-
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1150229b6366..54a90e83cb31 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5,6 +5,7 @@
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ * Numa awareness, Christoph Lameter, SGI, June 2005
*/
#include <linux/mm.h>
@@ -88,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
{
pte_t *pte;
- pte = pte_alloc_kernel(&init_mm, pmd, addr);
+ pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
@@ -146,20 +147,18 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
- spin_lock(&init_mm.page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages);
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&init_mm.page_table_lock);
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end)
+struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end, int node)
{
struct vm_struct **p, *tmp, *area;
unsigned long align = 1;
@@ -178,7 +177,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
addr = ALIGN(start, align);
size = PAGE_ALIGN(size);
- area = kmalloc(sizeof(*area), GFP_KERNEL);
+ area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
if (unlikely(!area))
return NULL;
@@ -231,6 +230,12 @@ out:
return NULL;
}
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end)
+{
+ return __get_vm_area_node(size, flags, start, end, -1);
+}
+
/**
* get_vm_area - reserve a contingous kernel virtual area
*
@@ -246,6 +251,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
}
+struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
+{
+ return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
+}
+
/* Caller must hold vmlist_lock */
struct vm_struct *__remove_vm_area(void *addr)
{
@@ -342,7 +352,6 @@ void vfree(void *addr)
BUG_ON(in_interrupt());
__vunmap(addr, 1);
}
-
EXPORT_SYMBOL(vfree);
/**
@@ -360,7 +369,6 @@ void vunmap(void *addr)
BUG_ON(in_interrupt());
__vunmap(addr, 0);
}
-
EXPORT_SYMBOL(vunmap);
/**
@@ -392,10 +400,10 @@ void *vmap(struct page **pages, unsigned int count,
return area->addr;
}
-
EXPORT_SYMBOL(vmap);
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+ pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
@@ -406,9 +414,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE)
- pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL);
+ pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
else
- pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
+ pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
@@ -418,7 +426,10 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
memset(area->pages, 0, array_size);
for (i = 0; i < area->nr_pages; i++) {
- area->pages[i] = alloc_page(gfp_mask);
+ if (node < 0)
+ area->pages[i] = alloc_page(gfp_mask);
+ else
+ area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!area->pages[i])) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
@@ -435,18 +446,25 @@ fail:
return NULL;
}
+void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+{
+ return __vmalloc_area_node(area, gfp_mask, prot, -1);
+}
+
/**
- * __vmalloc - allocate virtually contiguous memory
+ * __vmalloc_node - allocate virtually contiguous memory
*
* @size: allocation size
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
+ * @node node to use for allocation or -1
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+ int node)
{
struct vm_struct *area;
@@ -454,13 +472,18 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
if (!size || (size >> PAGE_SHIFT) > num_physpages)
return NULL;
- area = get_vm_area(size, VM_ALLOC);
+ area = get_vm_area_node(size, VM_ALLOC, node);
if (!area)
return NULL;
- return __vmalloc_area(area, gfp_mask, prot);
+ return __vmalloc_area_node(area, gfp_mask, prot, node);
}
+EXPORT_SYMBOL(__vmalloc_node);
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+{
+ return __vmalloc_node(size, gfp_mask, prot, -1);
+}
EXPORT_SYMBOL(__vmalloc);
/**
@@ -478,9 +501,26 @@ void *vmalloc(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}
-
EXPORT_SYMBOL(vmalloc);
+/**
+ * vmalloc_node - allocate memory on a specific node
+ *
+ * @size: allocation size
+ * @node; numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight cotrol over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+}
+EXPORT_SYMBOL(vmalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -515,7 +555,6 @@ void *vmalloc_32(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
-
EXPORT_SYMBOL(vmalloc_32);
long vread(char *buf, char *addr, unsigned long count)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64f9570cff56..135bf8ca96ee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -70,7 +70,7 @@ struct scan_control {
unsigned int priority;
/* This context's GFP mask */
- unsigned int gfp_mask;
+ gfp_t gfp_mask;
int may_writepage;
@@ -186,7 +186,7 @@ EXPORT_SYMBOL(remove_shrinker);
*
* Returns the number of slab objects which we shrunk.
*/
-static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
+static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
@@ -417,7 +417,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!sc->may_swap)
+ goto keep_locked;
if (!add_to_swap(page))
goto activate_locked;
}
@@ -519,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page->private };
+ swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);
@@ -926,7 +928,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
+int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
int priority;
int ret = 0;
@@ -1338,7 +1340,7 @@ module_init(kswapd_init)
/*
* Try to free up some pages from this zone through reclaim.
*/
-int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
struct scan_control sc;
int nr_pages = 1 << order;