From 6047a007d0f6b7395cd158f3bdda34ab39a48821 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 14 Jan 2009 12:22:25 +0200 Subject: SLUB: Use ->objsize from struct kmem_cache_cpu in slab_free() There's no reason to use ->objsize from struct kmem_cache in slab_free() for the SLAB_DEBUG_OBJECTS case. All it does is generate extra cache pressure as we try very hard not to touch struct kmem_cache in the fast-path. Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6392ae5cc6b1..f21e25ad453b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1724,7 +1724,7 @@ static __always_inline void slab_free(struct kmem_cache *s, c = get_cpu_slab(s, smp_processor_id()); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + debug_check_no_obj_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; -- cgit v1.2.3 From 6e9ed0cc4b963fde66ab47d9fb19147631e44555 Mon Sep 17 00:00:00 2001 From: Américo Wang Date: Mon, 19 Jan 2009 02:00:38 +0800 Subject: slob: clean up the code - Use NULL instead of plain 0; - Rename slob_page() to is_slob_page(); - Define slob_page() to convert void* to struct slob_page*; - Rename slob_new_page() to slob_new_pages(); - Define slob_free_pages() accordingly. Compile tests only. Signed-off-by: WANG Cong Signed-off-by: Matt Mackall Cc: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slob.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..c9cd31d27e69 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,9 +126,9 @@ static LIST_HEAD(free_slob_medium); static LIST_HEAD(free_slob_large); /* - * slob_page: True for all slob pages (false for bigblock pages) + * is_slob_page: True for all slob pages (false for bigblock pages) */ -static inline int slob_page(struct slob_page *sp) +static inline int is_slob_page(struct slob_page *sp) { return PageSlobPage((struct page *)sp); } @@ -143,6 +143,11 @@ static inline void clear_slob_page(struct slob_page *sp) __ClearPageSlobPage((struct page *)sp); } +static inline struct slob_page *slob_page(const void *addr) +{ + return (struct slob_page *)virt_to_page(addr); +} + /* * slob_page_free: true for pages on free_slob_pages list. */ @@ -230,7 +235,7 @@ static int slob_last(slob_t *s) return !((unsigned long)slob_next(s) & ~PAGE_MASK); } -static void *slob_new_page(gfp_t gfp, int order, int node) +static void *slob_new_pages(gfp_t gfp, int order, int node) { void *page; @@ -247,12 +252,17 @@ static void *slob_new_page(gfp_t gfp, int order, int node) return page_address(page); } +static void slob_free_pages(void *b, int order) +{ + free_pages((unsigned long)b, order); +} + /* * Allocate a slob block within a given slob_page sp. 
*/ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) { - slob_t *prev, *cur, *aligned = 0; + slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { @@ -349,10 +359,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) /* Not enough space: must allocate a new page */ if (!b) { - b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); + b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); if (!b) - return 0; - sp = (struct slob_page *)virt_to_page(b); + return NULL; + sp = slob_page(b); set_slob_page(sp); spin_lock_irqsave(&slob_lock, flags); @@ -384,7 +394,7 @@ static void slob_free(void *block, int size) return; BUG_ON(!size); - sp = (struct slob_page *)virt_to_page(block); + sp = slob_page(block); units = SLOB_UNITS(size); spin_lock_irqsave(&slob_lock, flags); @@ -476,7 +486,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } else { void *ret; - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); if (ret) { struct page *page; page = virt_to_page(ret); @@ -494,8 +504,8 @@ void kfree(const void *block) if (unlikely(ZERO_OR_NULL_PTR(block))) return; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); @@ -513,8 +523,8 @@ size_t ksize(const void *block) if (unlikely(block == ZERO_SIZE_PTR)) return 0; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; @@ -572,7 +582,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) b = slob_alloc(c->size, flags, c->align, node); else - b = slob_new_page(flags, get_order(c->size), node); + b = slob_new_pages(flags, get_order(c->size), node); if (c->ctor) c->ctor(b); @@ -586,7 +596,7 @@ static void __kmem_cache_free(void *b, int size) if (size < PAGE_SIZE) slob_free(b, size); else - free_pages((unsigned long)b, get_order(size)); + slob_free_pages(b, get_order(size)); } static void kmem_rcu_free(struct rcu_head *head) -- cgit v1.2.3 From 6146f0d5e47ca4047ffded0fb79b6c25359b386c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:06:57 -0500 Subject: integrity: IMA hooks This patch replaces the generic integrity hooks, for which IMA registered itself, with IMA integrity hooks in the appropriate places directly in the fs directory. Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- Documentation/kernel-parameters.txt | 1 + fs/exec.c | 10 +++++++++ fs/file_table.c | 2 ++ fs/inode.c | 24 ++++++++++++++------ fs/namei.c | 8 +++++++ include/linux/ima.h | 44 +++++++++++++++++++++++++++++++++++++ mm/mmap.c | 4 ++++ 7 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 include/linux/ima.h (limited to 'mm') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a2d8805c03d5..7c67b94d1823 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -44,6 +44,7 @@ parameter is applicable: FB The frame buffer device is enabled. HW Appropriate hardware is enabled. 
IA-64 IA-64 architecture is enabled. + IMA Integrity measurement architecture is enabled. IOSCHED More than one I/O scheduler is enabled. IP_PNP IP DHCP, BOOTP, or RARP is enabled. ISAPNP ISA PnP code is enabled. diff --git a/fs/exec.c b/fs/exec.c index 02d2e120542d..9c789a525cc4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -128,6 +129,9 @@ asmlinkage long sys_uselib(const char __user * library) goto exit; error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); + if (error) + goto exit; + error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -681,6 +685,9 @@ struct file *open_exec(const char *name) goto out_path_put; err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); + if (err) + goto out_path_put; + err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; @@ -1207,6 +1214,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } #endif retval = security_bprm_check(bprm); + if (retval) + return retval; + retval = ima_bprm_check(bprm); if (retval) return retval; diff --git a/fs/file_table.c b/fs/file_table.c index 0fbcacc3ea75..55895ccc08c6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -276,6 +277,7 @@ void __fput(struct file *file) if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); + ima_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); diff --git a/fs/inode.c b/fs/inode.c index 098a2443196f..ed22b14f2202 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -144,13 +145,13 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_cdev = NULL; inode->i_rdev = 0; inode->dirtied_when = 0; - if (security_inode_alloc(inode)) { - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; - } + + if (security_inode_alloc(inode)) + goto out_free_inode; + + /* allocate and initialize an i_integrity */ + if (ima_inode_alloc(inode)) + goto out_free_security; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -186,6 +187,15 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_mapping = mapping; return inode; + +out_free_security: + security_inode_free(inode); +out_free_inode: + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; } EXPORT_SYMBOL(inode_init_always); diff --git a/fs/namei.c b/fs/namei.c index af3783fff1de..734f2b5591bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -860,6 +861,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) err = exec_permission_lite(inode); if (err == -EAGAIN) err = vfs_permission(nd, MAY_EXEC); + if (!err) + err = ima_path_check(&nd->path, MAY_EXEC); if (err) break; @@ -1525,6 +1528,11 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) error = vfs_permission(nd, acc_mode); if (error) return error; + + error = ima_path_check(&nd->path, + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + if (error) + return error; /* * An 
append-only file must be opened in append mode for writing. */ diff --git a/include/linux/ima.h b/include/linux/ima.h new file mode 100644 index 000000000000..4ed1e4d962e2 --- /dev/null +++ b/include/linux/ima.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2008 IBM Corporation + * Author: Mimi Zohar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2 of the License. + */ + +#include + +#ifndef _LINUX_IMA_H +#define _LINUX_IMA_H + +static inline int ima_bprm_check(struct linux_binprm *bprm) +{ + return 0; +} + +static inline int ima_inode_alloc(struct inode *inode) +{ + return 0; +} + +static inline void ima_inode_free(struct inode *inode) +{ + return; +} + +static inline int ima_path_check(struct path *path, int mask) +{ + return 0; +} + +static inline void ima_file_free(struct file *file) +{ + return; +} + +static inline int ima_file_mmap(struct file *file, unsigned long prot) +{ + return 0; +} +#endif /* _LINUX_IMA_H */ diff --git a/mm/mmap.c b/mm/mmap.c index d4855a682ab6..c3647f3b0621 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1048,6 +1049,9 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } error = security_file_mmap(file, reqprot, prot, flags, addr, 0); + if (error) + return error; + error = ima_file_mmap(file, prot); if (error) return error; -- cgit v1.2.3 From 1df9f0a73178718969ae47d813b8e7aab2cf073c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:07:02 -0500 Subject: Integrity: IMA file free imbalance The number of calls to ima_path_check()/ima_file_free() should be balanced. An extra call to fput(), indicates the file could have been accessed without first being measured. Although f_count is incremented/decremented in places other than fget/fput, like fget_light/fput_light and get_file, the current task must already hold a file refcnt. The call to __fput() is delayed until the refcnt becomes 0, resulting in ima_file_free() flagging any changes. 
- add hook to increment opencount for IPC shared memory(SYSV), shmat files, and /dev/zero - moved NULL iint test in opencount_get() Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/ima.h | 6 ++++++ ipc/shm.c | 3 +++ mm/shmem.c | 2 ++ security/integrity/ima/ima.h | 2 ++ security/integrity/ima/ima_iint.c | 17 ++++++++++++++++ security/integrity/ima/ima_main.c | 42 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 72 insertions(+) (limited to 'mm') diff --git a/include/linux/ima.h b/include/linux/ima.h index dcc3664feee8..6db30a328d98 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -19,6 +19,7 @@ extern void ima_inode_free(struct inode *inode); extern int ima_path_check(struct path *path, int mask); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); +extern void ima_shm_check(struct file *file); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -50,5 +51,10 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) { return 0; } + +static inline void ima_shm_check(struct file *file) +{ + return; +} #endif /* CONFIG_IMA_H */ #endif /* _LINUX_IMA_H */ diff --git a/ipc/shm.c b/ipc/shm.c index 38a055758a9b..d39bd7637b1c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -381,6 +382,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; + ima_shm_check(file); id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (id < 0) { @@ -888,6 +890,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); if (!file) goto out_free; + ima_shm_check(file); file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; diff --git a/mm/shmem.c b/mm/shmem.c index f1b0d4871f3a..dd5588f5d939 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -2600,6 +2601,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); + ima_shm_check(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 42706b554921..e3c16a21a38e 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -97,6 +97,7 @@ static inline unsigned long ima_hash_key(u8 *digest) /* iint cache flags */ #define IMA_MEASURED 1 +#define IMA_IINT_DUMP_STACK 512 /* integrity data associated with an inode */ struct ima_iint_cache { @@ -106,6 +107,7 @@ struct ima_iint_cache { struct mutex mutex; /* protects: version, flags, digest */ long readcount; /* measured files readcount */ long writecount; /* measured files writecount */ + long opencount; /* opens reference count */ struct kref refcount; /* ima_iint_cache reference count */ struct rcu_head rcu; }; diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c index 750db3c993a7..1f035e8d29c7 100644 --- a/security/integrity/ima/ima_iint.c +++ b/security/integrity/ima/ima_iint.c @@ -126,6 +126,7 @@ struct ima_iint_cache *ima_iint_find_insert_get(struct inode *inode) return iint; } +EXPORT_SYMBOL_GPL(ima_iint_find_insert_get); /* iint_free - called when the iint refcount goes to zero */ void iint_free(struct kref *kref) @@ -134,6 +135,21 @@ void iint_free(struct kref *kref) refcount); 
iint->version = 0; iint->flags = 0UL; + if (iint->readcount != 0) { + printk(KERN_INFO "%s: readcount: %ld\n", __FUNCTION__, + iint->readcount); + iint->readcount = 0; + } + if (iint->writecount != 0) { + printk(KERN_INFO "%s: writecount: %ld\n", __FUNCTION__, + iint->writecount); + iint->writecount = 0; + } + if (iint->opencount != 0) { + printk(KERN_INFO "%s: opencount: %ld\n", __FUNCTION__, + iint->opencount); + iint->opencount = 0; + } kref_set(&iint->refcount, 1); kmem_cache_free(iint_cache, iint); } @@ -174,6 +190,7 @@ static void init_once(void *foo) mutex_init(&iint->mutex); iint->readcount = 0; iint->writecount = 0; + iint->opencount = 0; kref_set(&iint->refcount, 1); } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 871e356e8d6c..f4e7266f5aee 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -66,6 +66,19 @@ void ima_file_free(struct file *file) return; mutex_lock(&iint->mutex); + if (iint->opencount <= 0) { + printk(KERN_INFO + "%s: %s open/free imbalance (r:%ld w:%ld o:%ld f:%ld)\n", + __FUNCTION__, file->f_dentry->d_name.name, + iint->readcount, iint->writecount, + iint->opencount, atomic_long_read(&file->f_count)); + if (!(iint->flags & IMA_IINT_DUMP_STACK)) { + dump_stack(); + iint->flags |= IMA_IINT_DUMP_STACK; + } + } + iint->opencount--; + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) iint->readcount--; @@ -119,6 +132,7 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, pr_info("%s dentry_open failed\n", filename); return rc; } + iint->opencount++; iint->readcount++; rc = ima_collect_measurement(iint, file); @@ -159,6 +173,7 @@ int ima_path_check(struct path *path, int mask) return 0; mutex_lock(&iint->mutex); + iint->opencount++; if ((mask & MAY_WRITE) || (mask == 0)) iint->writecount++; else if (mask & (MAY_READ | MAY_EXEC)) @@ -219,6 +234,21 @@ out: return rc; } +static void opencount_get(struct file *file) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ima_iint_cache *iint; + + if (!ima_initialized || !S_ISREG(inode->i_mode)) + return; + iint = ima_iint_find_insert_get(inode); + if (!iint) + return; + mutex_lock(&iint->mutex); + iint->opencount++; + mutex_unlock(&iint->mutex); +} + /** * ima_file_mmap - based on policy, collect/store measurement. * @file: pointer to the file to be measured (May be NULL) @@ -242,6 +272,18 @@ int ima_file_mmap(struct file *file, unsigned long prot) return 0; } +/* + * ima_shm_check - IPC shm and shmat create/fput a file + * + * Maintain the opencount for these files to prevent unnecessary + * imbalance messages. + */ +void ima_shm_check(struct file *file) +{ + opencount_get(file); + return; +} + /** * ima_bprm_check - based on policy, collect/store measurement. 
* @bprm: contains the linux_binprm structure -- cgit v1.2.3 From ed850a52af971528b048812c4215cef298af0d3b Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 10 Feb 2009 23:01:19 -0500 Subject: integrity: shmem zero fix Based on comments from Mike Frysinger and Randy Dunlap: (http://lkml.org/lkml/2009/2/9/262) - moved ima.h include before CONFIG_SHMEM test to fix compiler error on Blackfin: mm/shmem.c: In function 'shmem_zero_setup': mm/shmem.c:2670: error: implicit declaration of function 'ima_shm_check' - added 'struct linux_binprm' in ima.h to fix compiler warning on Blackfin: In file included from mm/shmem.c:32: include/linux/ima.h:25: warning: 'struct linux_binprm' declared inside parameter list include/linux/ima.h:25: warning: its scope is only this definition or declaration, which is probably not what you want - moved fs.h include within _LINUX_IMA_H definition Signed-off-by: Mimi Zohar Signed-off-by: Mike Frysinger Signed-off-by: James Morris --- include/linux/ima.h | 5 +++-- mm/shmem.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/ima.h b/include/linux/ima.h index 6db30a328d98..0e2aa45cb0ce 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -7,11 +7,12 @@ * the Free Software Foundation, version 2 of the License. */ -#include - #ifndef _LINUX_IMA_H #define _LINUX_IMA_H +#include +struct linux_binprm; + #ifdef CONFIG_IMA extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_inode_alloc(struct inode *inode); diff --git a/mm/shmem.c b/mm/shmem.c index 75199888a6bd..8135fac294ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include static struct vfsmount *shm_mnt; @@ -59,7 +60,6 @@ static struct vfsmount *shm_mnt; #include #include #include -#include #include #include -- cgit v1.2.3 From 734269521e320ad14ed39ae9b64d482b9028dcd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:07 +0900 Subject: vmalloc: call flush_cache_vunmap() from unmap_kernel_range() Impact: proper vcache flush on unmap_kernel_range() flush_cache_vunmap() should be called before pages are unmapped. Add a call to it in unmap_kernel_range(). Signed-off-by: Tejun Heo --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8c..c37924a2ee36 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1012,6 +1012,8 @@ void __init vmalloc_init(void) void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } -- cgit v1.2.3 From f2a8205c4ef1af917d175c36a4097ae5587791c8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: percpu: kill percpu_alloc() and friends Impact: kill unused functions percpu_alloc() and its friends never saw much action. It was supposed to replace the cpu-mask unaware __alloc_percpu() but it never happened and in fact __percpu_alloc_mask() itself never really grew proper up/down handling interface either (no exported interface for populate/depopulate). percpu allocation is about to go through major reimplementation and there's no reason to carry this unused interface around. Replace it with __alloc_percpu() and free_percpu(). 
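For reference, a minimal usage sketch of the interface that remains after this change (the struct example_stats type and the helper names below are invented purely for illustration):

#include <linux/percpu.h>
#include <linux/cpumask.h>

/* example_stats is a made-up type used only to show the call pattern. */
struct example_stats {
	unsigned long packets;
	unsigned long bytes;
};

static struct example_stats *example_stats_alloc(void)
{
	struct example_stats *stats;
	int cpu;

	/* alloc_percpu() expands to __alloc_percpu(size, align) */
	stats = alloc_percpu(struct example_stats);
	if (!stats)
		return NULL;

	/* every possible CPU gets its own zeroed copy; walk them with per_cpu_ptr() */
	for_each_possible_cpu(cpu)
		per_cpu_ptr(stats, cpu)->packets = 0;

	return stats;
}

static void example_stats_free(struct example_stats *stats)
{
	free_percpu(stats);	/* replaces the old percpu_free() */
}
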
Signed-off-by: Tejun Heo --- include/linux/percpu.h | 47 ++++++++++++++++++++++------------------------- mm/allocpercpu.c | 32 +++++++++++++++++++------------- 2 files changed, 41 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1fdaee93c04d..d99e24ae1811 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -82,46 +82,43 @@ struct percpu_data { #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); -extern void percpu_free(void *__pdata); +/* + * Use this to get to a cpu's version of the per-cpu object + * dynamically allocated. Non-atomic access to the current CPU's + * version should probably be combined with get_cpu()/put_cpu(). + */ +#define per_cpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ +}) + +extern void *__alloc_percpu(size_t size, size_t align); +extern void free_percpu(void *__pdata); #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +static inline void *__alloc_percpu(size_t size, size_t align) { + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. + */ + WARN_ON_ONCE(align > __alignof__(unsigned long long)); return kzalloc(size, gfp); } -static inline void percpu_free(void *__pdata) +static inline void free_percpu(void *p) { - kfree(__pdata); + kfree(p); } #endif /* CONFIG_SMP */ -#define percpu_alloc_mask(size, gfp, mask) \ - __percpu_alloc_mask((size), (gfp), &(mask)) - -#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map) - -/* (legacy) interface for use without CPU hotplug handling */ - -#define __alloc_percpu(size, align) percpu_alloc_mask((size), GFP_KERNEL, \ - cpu_possible_map) #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ __alignof__(type)) -#define free_percpu(ptr) percpu_free((ptr)) -/* - * Use this to get to a cpu's version of the per-cpu object dynamically - * allocated. Non-atomic access to the current CPU's version should - * probably be combined with get_cpu()/put_cpu(). - */ -#define per_cpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ -}) #endif /* __LINUX_PERCPU_H */ diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 4297bc41bfd2..3653c570232b 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** - * percpu_alloc_mask - initial setup of per-cpu data + * alloc_percpu - initial setup of per-cpu data * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-data for cpu's selected through mask bits + * @align: alignment * - * Populating per-cpu data for all online cpu's would be a typical use case, - * which is simplified by the percpu_alloc() wrapper. - * Per-cpu objects are populated with zeroed buffers. + * Allocate dynamic percpu area. Percpu objects are populated with + * zeroed buffers. 
*/ -void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +void *__alloc_percpu(size_t size, size_t align) { /* * We allocate whole cache lines to avoid false sharing */ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, gfp); + void *pdata = kzalloc(sz, GFP_KERNEL); void *__pdata = __percpu_disguise(pdata); + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. + */ + WARN_ON_ONCE(align > __alignof__(unsigned long long)); + if (unlikely(!pdata)) return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) + if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, + &cpu_possible_map))) return __pdata; kfree(pdata); return NULL; } -EXPORT_SYMBOL_GPL(__percpu_alloc_mask); +EXPORT_SYMBOL_GPL(__alloc_percpu); /** - * percpu_free - final cleanup of per-cpu data + * free_percpu - final cleanup of per-cpu data * @__pdata: object to clean up * * We simply clean up any per-cpu object left. No need for the client to * track and specify through a bis mask which per-cpu objects are to free. */ -void percpu_free(void *__pdata) +void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } -EXPORT_SYMBOL_GPL(percpu_free); +EXPORT_SYMBOL_GPL(free_percpu); -- cgit v1.2.3 From f0aa6617903648077dffe5cfcf7c4458f4610fa7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: vmalloc: implement vm_area_register_early() Impact: allow multiple early vm areas There are places where kernel VM area needs to be allocated before vmalloc is initialized. This is done by allocating static vm_struct, initializing several fields and linking it to vmlist and later vmalloc initialization picking up these from vmlist. This is currently done manually and if there's more than one such areas, there's no defined way to arbitrate who gets which address. This patch implements vm_area_register_early(), which takes vm_area struct with flags and size initialized, assigns address to it and puts it on the vmlist. This way, multiple early vm areas can determine which addresses they should use. The only current user - alpha mm init - is converted to use it. Signed-off-by: Tejun Heo --- arch/alpha/mm/init.c | 20 +++++++++++++------- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 5d7a16eab312..df6df025ded4 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -189,9 +189,21 @@ callback_init(void * kernel_end) if (alpha_using_srm) { static struct vm_struct console_remap_vm; - unsigned long vaddr = VMALLOC_START; + unsigned long nr_pages = 0; + unsigned long vaddr; unsigned long i, j; + /* calculate needed size */ + for (i = 0; i < crb->map_entries; ++i) + nr_pages += crb->map[i].count; + + /* register the vm area */ + console_remap_vm.flags = VM_ALLOC; + console_remap_vm.size = nr_pages << PAGE_SHIFT; + vm_area_register_early(&console_remap_vm); + + vaddr = (unsigned long)consle_remap_vm.addr; + /* Set up the third level PTEs and update the virtual addresses of the CRB entries. 
*/ for (i = 0; i < crb->map_entries; ++i) { @@ -213,12 +225,6 @@ callback_init(void * kernel_end) vaddr += PAGE_SIZE; } } - - /* Let vmalloc know that we've allocated some space. */ - console_remap_vm.flags = VM_ALLOC; - console_remap_vm.addr = (void *) VMALLOC_START; - console_remap_vm.size = vaddr - VMALLOC_START; - vmlist = &console_remap_vm; } callback_init_done = 1; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 506e7620a986..bbc051392298 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -106,5 +106,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; +extern __init void vm_area_register_early(struct vm_struct *vm); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c37924a2ee36..d206261ad9ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -982,6 +983,29 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +/** + * vm_area_register_early - register vmap area early during boot + * @vm: vm_struct to register + * @size: size of area to register + * + * This function is used to register kernel vm area before + * vmalloc_init() is called. @vm->size and @vm->flags should contain + * proper values on entry and other fields should be zero. On return, + * vm->addr contains the allocated address. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_register_early(struct vm_struct *vm) +{ + static size_t vm_init_off __initdata; + + vm->addr = (void *)VMALLOC_START + vm_init_off; + vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + + vm->next = vmlist; + vmlist = vm; +} + void __init vmalloc_init(void) { struct vmap_area *va; -- cgit v1.2.3 From 8fc48985006da4ceba24508db64ec77fc0dfe3bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: vmalloc: add un/map_kernel_range_noflush() Impact: two more public map/unmap functions Implement map_kernel_range_noflush() and unmap_kernel_range_noflush(). These functions respectively map and unmap address range in kernel VM area but doesn't do any vcache or tlb flushing. These will be used by new percpu allocator. Signed-off-by: Tejun Heo Cc: Nick Piggin --- include/linux/vmalloc.h | 3 +++ mm/vmalloc.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index bbc051392298..599ba7984310 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -91,6 +91,9 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); +extern int map_kernel_range_noflush(unsigned long start, unsigned long size, + pgprot_t prot, struct page **pages); +extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); /* Allocate/destroy a 'vmalloc' VM area. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d206261ad9ef..224eca9650a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -153,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. 
pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -170,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + int ret; + + ret = vmap_page_range_noflush(start, end, prot, pages); + flush_cache_vmap(start, end); + return ret; +} + static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -1033,6 +1042,58 @@ void __init vmalloc_init(void) vmap_initialized = true; } +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vmap() on to-be-mapped areas + * before calling this function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_page_range_noflush(addr, addr + size, prot, pages); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vunmap() on to-be-mapped areas + * before calling this function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +{ + vunmap_page_range(addr, addr + size); +} + +/** + * unmap_kernel_range - unmap kernel VM area and flush cache and TLB + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Similar to unmap_kernel_range_noflush() but flushes vcache before + * the unmapping and tlb after. + */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; -- cgit v1.2.3 From fbf59bc9d74d1fb30b8e0630743aff2806eafcea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: percpu: implement new dynamic percpu allocator Impact: new scalable dynamic percpu allocator which allows dynamic percpu areas to be accessed the same way as static ones Implement scalable dynamic percpu allocator which can be used for both static and dynamic percpu areas. This will allow static and dynamic areas to share faster direct access methods. This feature is optional and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by arch. Please read comment on top of mm/percpu.c for details. 
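As the changelog says, dynamic areas become accessible the same way as static ones. A small access sketch under the usual get_cpu()/put_cpu() discipline (the helper and parameter names are invented; the area is assumed to have come from __alloc_percpu()):

#include <linux/percpu.h>
#include <linux/smp.h>

/* example_count_event() is a hypothetical helper; 'counters' is assumed to
 * point at per-CPU storage returned by __alloc_percpu(sizeof(unsigned long),
 * __alignof__(unsigned long)). */
static unsigned long example_count_event(unsigned long *counters)
{
	int cpu = get_cpu();	/* disable preemption while touching this CPU's copy */
	unsigned long val;

	val = ++*per_cpu_ptr(counters, cpu);

	put_cpu();		/* re-enable preemption */
	return val;
}
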
Signed-off-by: Tejun Heo Cc: Andrew Morton --- include/linux/percpu.h | 22 +- kernel/module.c | 31 ++ mm/Makefile | 4 + mm/percpu.c | 890 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 943 insertions(+), 4 deletions(-) create mode 100644 mm/percpu.c (limited to 'mm') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d99e24ae1811..18080995ff3e 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -76,23 +76,37 @@ #ifdef CONFIG_SMP -struct percpu_data { - void *ptrs[1]; -}; +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA -#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) +extern void *pcpu_base_addr; +typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); + +extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size); /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's * version should probably be combined with get_cpu()/put_cpu(). */ +#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) + +#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + +struct percpu_data { + void *ptrs[1]; +}; + +#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) + #define per_cpu_ptr(ptr, cpu) \ ({ \ struct percpu_data *__p = __percpu_disguise(ptr); \ (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); diff --git a/kernel/module.c b/kernel/module.c index 52b3497b8748..1f0657ae555b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -51,6 +51,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -366,6 +367,34 @@ static struct module *find_module(const char *name) } #ifdef CONFIG_SMP + +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) +{ + void *ptr; + + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; + } + + ptr = __alloc_percpu(size, align); + if (!ptr) + printk(KERN_WARNING + "Could not allocate %lu bytes percpu data\n", size); + return ptr; +} + +static void percpu_modfree(void *freeme) +{ + free_percpu(freeme); +} + +#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; /* Size of each block. -ve means used. 
*/ @@ -499,6 +528,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) diff --git a/mm/Makefile b/mm/Makefile index 72255be57f89..818569b68f46 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o +ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +obj-$(CONFIG_SMP) += percpu.o +else obj-$(CONFIG_SMP) += allocpercpu.o +endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/percpu.c b/mm/percpu.c new file mode 100644 index 000000000000..4617d97e877c --- /dev/null +++ b/mm/percpu.c @@ -0,0 +1,890 @@ +/* + * linux/mm/percpu.c - percpu memory allocator + * + * Copyright (C) 2009 SUSE Linux Products GmbH + * Copyright (C) 2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * This is percpu allocator which can handle both static and dynamic + * areas. Percpu areas are allocated in chunks in vmalloc area. Each + * chunk is consisted of num_possible_cpus() units and the first chunk + * is used for static percpu variables in the kernel image (special + * boot time alloc/init handling necessary as these areas need to be + * brought up before allocation services are running). Unit grows as + * necessary and all units grow or shrink in unison. When a chunk is + * filled up, another chunk is allocated. ie. in vmalloc area + * + * c0 c1 c2 + * ------------------- ------------------- ------------ + * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u + * ------------------- ...... ------------------- .... ------------ + * + * Allocation is done in offset-size areas of single unit space. Ie, + * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, + * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring + * percpu base registers UNIT_SIZE apart. + * + * There are usually many small percpu allocations many of them as + * small as 4 bytes. The allocator organizes chunks into lists + * according to free size and tries to allocate from the fullest one. + * Each chunk keeps the maximum contiguous area size hint which is + * guaranteed to be eqaul to or larger than the maximum contiguous + * area in the chunk. This helps the allocator not to iterate the + * chunk maps unnecessarily. + * + * Allocation state in each chunk is kept using an array of integers + * on chunk->map. A positive value in the map represents a free + * region and negative allocated. Allocation inside a chunk is done + * by scanning this map sequentially and serving the first matching + * entry. This is mostly copied from the percpu_modalloc() allocator. + * Chunks are also linked into a rb tree to ease address to chunk + * mapping during free. + * + * To use this allocator, arch code should do the followings. 
+ * + * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * + * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate + * regular address to percpu pointer and back + * + * - use pcpu_setup_static() during percpu area initialization to + * setup kernel static percpu area + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */ +#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ +#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ + +struct pcpu_chunk { + struct list_head list; /* linked to pcpu_slot lists */ + struct rb_node rb_node; /* key is chunk->vm->addr */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + struct vm_struct *vm; /* mapped vmalloc region */ + int map_used; /* # of map entries used */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + struct page *page[]; /* #cpus * UNIT_PAGES */ +}; + +static int pcpu_unit_pages_shift; +static int pcpu_unit_pages; +static int pcpu_unit_shift; +static int pcpu_unit_size; +static int pcpu_chunk_size; +static int pcpu_nr_slots; +static size_t pcpu_chunk_struct_size; + +/* the address of the first chunk which starts with the kernel static area */ +void *pcpu_base_addr; +EXPORT_SYMBOL_GPL(pcpu_base_addr); + +/* the size of kernel static area */ +static int pcpu_static_size; + +/* + * One mutex to rule them all. + * + * The following mutex is grabbed in the outermost public alloc/free + * interface functions and released only when the operation is + * complete. As such, every function in this file other than the + * outermost functions are called under pcpu_mutex. + * + * It can easily be switched to use spinlock such that only the area + * allocation and page population commit are protected with it doing + * actual [de]allocation without holding any lock. However, given + * what this allocator does, I think it's better to let them run + * sequentially. + */ +static DEFINE_MUTEX(pcpu_mutex); + +static struct list_head *pcpu_slot; /* chunk list slots */ +static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ + +static int pcpu_size_to_slot(int size) +{ + int highbit = fls(size); + return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); +} + +static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) +{ + if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + return 0; + + return pcpu_size_to_slot(chunk->free_size); +} + +static int pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return (cpu << pcpu_unit_pages_shift) + page_idx; +} + +static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return &chunk->page[pcpu_page_idx(cpu, page_idx)]; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->vm->addr + + (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); +} + +static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, + int page_idx) +{ + return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; +} + +/** + * pcpu_realloc - versatile realloc + * @p: the current pointer (can be NULL for new allocations) + * @size: the current size (can be 0 for new allocations) + * @new_size: the wanted new size (can be 0 for free) + * + * More robust realloc which can be used to allocate, resize or free a + * memory area of arbitrary size. 
If the needed size goes over + * PAGE_SIZE, kernel VM is used. + * + * RETURNS: + * The new pointer on success, NULL on failure. + */ +static void *pcpu_realloc(void *p, size_t size, size_t new_size) +{ + void *new; + + if (new_size <= PAGE_SIZE) + new = kmalloc(new_size, GFP_KERNEL); + else + new = vmalloc(new_size); + if (new_size && !new) + return NULL; + + memcpy(new, p, min(size, new_size)); + if (new_size > size) + memset(new + size, 0, new_size - size); + + if (size <= PAGE_SIZE) + kfree(p); + else + vfree(p); + + return new; +} + +/** + * pcpu_chunk_relocate - put chunk in the appropriate chunk slot + * @chunk: chunk of interest + * @oslot: the previous slot it was on + * + * This function is called after an allocation or free changed @chunk. + * New slot according to the changed state is determined and @chunk is + * moved to the slot. + */ +static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) +{ + int nslot = pcpu_chunk_slot(chunk); + + if (oslot != nslot) { + if (oslot < nslot) + list_move(&chunk->list, &pcpu_slot[nslot]); + else + list_move_tail(&chunk->list, &pcpu_slot[nslot]); + } +} + +static struct rb_node **pcpu_chunk_rb_search(void *addr, + struct rb_node **parentp) +{ + struct rb_node **p = &pcpu_addr_root.rb_node; + struct rb_node *parent = NULL; + struct pcpu_chunk *chunk; + + while (*p) { + parent = *p; + chunk = rb_entry(parent, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) + p = &(*p)->rb_left; + else if (addr > chunk->vm->addr) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * pcpu_chunk_addr_search - search for chunk containing specified address + * @addr: address to search for + * + * Look for chunk which might contain @addr. More specifically, it + * searchs for the chunk with the highest start address which isn't + * beyond @addr. + * + * RETURNS: + * The address of the found chunk. + */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + struct rb_node *n, *parent; + struct pcpu_chunk *chunk; + + n = *pcpu_chunk_rb_search(addr, &parent); + if (!n) { + /* no exactly matching chunk, the parent is the closest */ + n = parent; + BUG_ON(!n); + } + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) { + /* the parent was the next one, look for the previous one */ + n = rb_prev(n); + BUG_ON(!n); + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + } + + return chunk; +} + +/** + * pcpu_chunk_addr_insert - insert chunk into address rb tree + * @new: chunk to insert + * + * Insert @new into address rb tree. + */ +static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) +{ + struct rb_node **p, *parent; + + p = pcpu_chunk_rb_search(new->vm->addr, &parent); + BUG_ON(*p); + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &pcpu_addr_root); +} + +/** + * pcpu_split_block - split a map block + * @chunk: chunk of interest + * @i: index of map block to split + * @head: head size (can be 0) + * @tail: tail size (can be 0) + * + * Split the @i'th map block into two or three blocks. If @head is + * non-zero, @head bytes block is inserted before block @i moving it + * to @i+1 and reducing its size by @head bytes. + * + * If @tail is non-zero, the target block, which can be @i or @i+1 + * depending on @head, is reduced by @tail bytes and @tail byte block + * is inserted after the target block. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +{ + int nr_extra = !!head + !!tail; + int target = chunk->map_used + nr_extra; + + /* reallocation required? */ + if (chunk->map_alloc < target) { + int new_alloc = chunk->map_alloc; + int *new; + + while (new_alloc < target) + new_alloc *= 2; + + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + chunk->map_alloc = new_alloc; + chunk->map = new; + } + + /* insert a new subblock */ + memmove(&chunk->map[i + nr_extra], &chunk->map[i], + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + + if (head) { + chunk->map[i + 1] = chunk->map[i] - head; + chunk->map[i++] = head; + } + if (tail) { + chunk->map[i++] -= tail; + chunk->map[i] = tail; + } + return 0; +} + +/** + * pcpu_alloc_area - allocate area from a pcpu_chunk + * @chunk: chunk of interest + * @size: wanted size + * @align: wanted align + * + * Try to allocate @size bytes area aligned at @align from @chunk. + * Note that this function only allocates the offset. It doesn't + * populate or map the area. + * + * RETURNS: + * Allocated offset in @chunk on success, -errno on failure. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +{ + int oslot = pcpu_chunk_slot(chunk); + int max_contig = 0; + int i, off; + + /* + * The static chunk initially doesn't have map attached + * because kmalloc wasn't available during init. Give it one. + */ + if (unlikely(!chunk->map)) { + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + if (!chunk->map) + return -ENOMEM; + + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = -pcpu_static_size; + if (chunk->free_size) + chunk->map[chunk->map_used++] = chunk->free_size; + } + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { + bool is_last = i + 1 == chunk->map_used; + int head, tail; + + /* extra for alignment requirement */ + head = ALIGN(off, align) - off; + BUG_ON(i == 0 && head != 0); + + if (chunk->map[i] < 0) + continue; + if (chunk->map[i] < head + size) { + max_contig = max(chunk->map[i], max_contig); + continue; + } + + /* + * If head is small or the previous block is free, + * merge'em. Note that 'small' is defined as smaller + * than sizeof(int), which is very small but isn't too + * uncommon for percpu allocations. 
+ */ + if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { + if (chunk->map[i - 1] > 0) + chunk->map[i - 1] += head; + else { + chunk->map[i - 1] -= head; + chunk->free_size -= head; + } + chunk->map[i] -= head; + off += head; + head = 0; + } + + /* if tail is small, just keep it around */ + tail = chunk->map[i] - head - size; + if (tail < sizeof(int)) + tail = 0; + + /* split if warranted */ + if (head || tail) { + if (pcpu_split_block(chunk, i, head, tail)) + return -ENOMEM; + if (head) { + i++; + off += head; + max_contig = max(chunk->map[i - 1], max_contig); + } + if (tail) + max_contig = max(chunk->map[i + 1], max_contig); + } + + /* update hint and mark allocated */ + if (is_last) + chunk->contig_hint = max_contig; /* fully scanned */ + else + chunk->contig_hint = max(chunk->contig_hint, + max_contig); + + chunk->free_size -= chunk->map[i]; + chunk->map[i] = -chunk->map[i]; + + pcpu_chunk_relocate(chunk, oslot); + return off; + } + + chunk->contig_hint = max_contig; /* fully scanned */ + pcpu_chunk_relocate(chunk, oslot); + + /* + * Tell the upper layer that this chunk has no area left. + * Note that this is not an error condition but a notification + * to upper layer that it needs to look at other chunks. + * -ENOSPC is chosen as it isn't used in memory subsystem and + * matches the meaning in a way. + */ + return -ENOSPC; +} + +/** + * pcpu_free_area - free area to a pcpu_chunk + * @chunk: chunk of interest + * @freeme: offset of area to free + * + * Free area starting from @freeme to @chunk. Note that this function + * only modifies the allocation map. It doesn't depopulate or unmap + * the area. + */ +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +{ + int oslot = pcpu_chunk_slot(chunk); + int i, off; + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) + if (off == freeme) + break; + BUG_ON(off != freeme); + BUG_ON(chunk->map[i] > 0); + + chunk->map[i] = -chunk->map[i]; + chunk->free_size += chunk->map[i]; + + /* merge with previous? */ + if (i > 0 && chunk->map[i - 1] >= 0) { + chunk->map[i - 1] += chunk->map[i]; + chunk->map_used--; + memmove(&chunk->map[i], &chunk->map[i + 1], + (chunk->map_used - i) * sizeof(chunk->map[0])); + i--; + } + /* merge with next? */ + if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { + chunk->map[i] += chunk->map[i + 1]; + chunk->map_used--; + memmove(&chunk->map[i + 1], &chunk->map[i + 2], + (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + } + + chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + pcpu_chunk_relocate(chunk, oslot); +} + +/** + * pcpu_unmap - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * @flush: whether to flush cache and tlb or not + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * If @flush is true, vcache is flushed before unmapping and tlb + * after. + */ +static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, + bool flush) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + + /* + * Each flushing trial can be very expensive, issue flush on + * the whole region at once rather than doing it for each cpu. + * This could be an overkill but is more scalable. 
+ */ + if (flush) + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + + for_each_possible_cpu(cpu) + unmap_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT); + + /* ditto as flush_cache_vunmap() */ + if (flush) + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off, + size_t size, bool flush) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int unmap_start = -1; + int uninitialized_var(unmap_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + if (!*pagep) + continue; + + __free_page(*pagep); + + /* + * If it's partial depopulation, it might get + * populated or depopulated again. Mark the + * page gone. + */ + *pagep = NULL; + + unmap_start = unmap_start < 0 ? i : unmap_start; + unmap_end = i + 1; + } + } + + if (unmap_start >= 0) + pcpu_unmap(chunk, unmap_start, unmap_end, flush); +} + +/** + * pcpu_map - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. + * vcache is flushed afterwards. + */ +static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + err = map_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT, + PAGE_KERNEL, + pcpu_chunk_pagep(chunk, cpu, page_start)); + if (err < 0) + return err; + } + + /* flush at once, please read comments in pcpu_unmap() */ + flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + return 0; +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int map_start = -1; + int map_end; + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + if (pcpu_chunk_page_occupied(chunk, i)) { + if (map_start >= 0) { + if (pcpu_map(chunk, map_start, map_end)) + goto err; + map_start = -1; + } + continue; + } + + map_start = map_start < 0 ? 
i : map_start; + map_end = i + 1; + + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + *pagep = alloc_pages_node(cpu_to_node(cpu), + alloc_mask, 0); + if (!*pagep) + goto err; + } + } + + if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) + goto err; + + for_each_possible_cpu(cpu) + memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0, + size); + + return 0; +err: + /* likely under heavy memory pressure, give memory back */ + pcpu_depopulate_chunk(chunk, off, size, true); + return -ENOMEM; +} + +static void free_pcpu_chunk(struct pcpu_chunk *chunk) +{ + if (!chunk) + return; + if (chunk->vm) + free_vm_area(chunk->vm); + pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + kfree(chunk); +} + +static struct pcpu_chunk *alloc_pcpu_chunk(void) +{ + struct pcpu_chunk *chunk; + + chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; + + chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + if (!chunk->vm) { + free_pcpu_chunk(chunk); + return NULL; + } + + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; + + return chunk; +} + +/** + * __alloc_percpu - allocate percpu area + * @size: size of area to allocate + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + void *ptr = NULL; + struct pcpu_chunk *chunk; + int slot, off; + + if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT || + align > PAGE_SIZE)) { + WARN(true, "illegal size (%zu) or align (%zu) for " + "percpu allocation\n", size, align); + return NULL; + } + + mutex_lock(&pcpu_mutex); + + /* allocate area */ + for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (size > chunk->contig_hint) + continue; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + if (off != -ENOSPC) + goto out_unlock; + } + } + + /* hmmm... no space left, create a new chunk */ + chunk = alloc_pcpu_chunk(); + if (!chunk) + goto out_unlock; + pcpu_chunk_relocate(chunk, -1); + pcpu_chunk_addr_insert(chunk); + + off = pcpu_alloc_area(chunk, size, align); + if (off < 0) + goto out_unlock; + +area_found: + /* populate, map and clear the area */ + if (pcpu_populate_chunk(chunk, off, size)) { + pcpu_free_area(chunk, off); + goto out_unlock; + } + + ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); +out_unlock: + mutex_unlock(&pcpu_mutex); + return ptr; +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +{ + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + list_del(&chunk->list); + rb_erase(&chunk->rb_node, &pcpu_addr_root); + free_pcpu_chunk(chunk); +} + +/** + * free_percpu - free percpu area + * @ptr: pointer to area to free + * + * Free percpu area @ptr. Might sleep. 
+ */ +void free_percpu(void *ptr) +{ + void *addr = __pcpu_ptr_to_addr(ptr); + struct pcpu_chunk *chunk; + int off; + + if (!ptr) + return; + + mutex_lock(&pcpu_mutex); + + chunk = pcpu_chunk_addr_search(addr); + off = addr - chunk->vm->addr; + + pcpu_free_area(chunk, off); + + /* the chunk became fully free, kill one if there are other free ones */ + if (chunk->free_size == pcpu_unit_size) { + struct pcpu_chunk *pos; + + list_for_each_entry(pos, + &pcpu_slot[pcpu_chunk_slot(chunk)], list) + if (pos != chunk) { + pcpu_kill_chunk(pos); + break; + } + } + + mutex_unlock(&pcpu_mutex); +} +EXPORT_SYMBOL_GPL(free_percpu); + +/** + * pcpu_setup_static - initialize kernel static percpu area + * @populate_pte_fn: callback to allocate pagetable + * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages + * + * Initialize kernel static percpu area. The caller should allocate + * all the necessary pages and pass them in @pages. + * @populate_pte_fn() is called on each page to be used for percpu + * mapping and is responsible for making sure all the necessary page + * tables for the page is allocated. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access. + */ +size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size) +{ + static struct vm_struct static_vm; + struct pcpu_chunk *static_chunk; + int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); + unsigned int cpu; + int err, i; + + pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT, + order_base_2(cpu_size) - PAGE_SHIFT); + + pcpu_static_size = cpu_size; + pcpu_unit_pages = 1 << pcpu_unit_pages_shift; + pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift; + pcpu_unit_size = 1 << pcpu_unit_shift; + pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + (1 << pcpu_unit_pages_shift) * sizeof(struct page *); + + /* allocate chunk slots */ + pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_slot[i]); + + /* init and register vm area */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + vm_area_register_early(&static_vm); + + /* init static_chunk */ + static_chunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&static_chunk->list); + static_chunk->vm = &static_vm; + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; + + /* assign pages and map them */ + for_each_possible_cpu(cpu) { + for (i = 0; i < nr_cpu_pages; i++) { + *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; + populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); + } + } + + err = pcpu_map(static_chunk, 0, nr_cpu_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", err); + + /* link static_chunk in */ + pcpu_chunk_relocate(static_chunk, -1); + pcpu_chunk_addr_insert(static_chunk); + + /* we're done */ + pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + return pcpu_unit_size; +} -- cgit v1.2.3 From ffadd4d0feb5376c82dc3a4104731b7ce2794edc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Feb 2009 12:05:07 -0500 Subject: SLUB: Introduce and use SLUB_MAX_SIZE and SLUB_PAGE_SHIFT constants As a preparational patch to bump up page allocator pass-through threshold, introduce two new constants SLUB_MAX_SIZE and SLUB_PAGE_SHIFT and convert mm/slub.c to use them. 
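For orientation, a condensed sketch of how the two new constants are meant to be used; it simply restates the __kmalloc() path that the hunks below converge on and is not itself part of the patch (get_slab(), kmalloc_large(), slab_alloc() and ZERO_OR_NULL_PTR() are existing SLUB helpers):

	static void *sketch_kmalloc(size_t size, gfp_t flags)
	{
		struct kmem_cache *s;

		if (unlikely(size > SLUB_MAX_SIZE))
			/* too big for the kmalloc caches: page allocator pass-through */
			return kmalloc_large(size, flags);

		/*
		 * Served from kmalloc_caches[], which now has SLUB_PAGE_SHIFT
		 * (= PAGE_SHIFT + 1) entries so the index of a PAGE_SIZE-sized
		 * request still fits.
		 */
		s = get_slab(size, flags);
		if (unlikely(ZERO_OR_NULL_PTR(s)))
			return s;
		return slab_alloc(s, flags, -1, _RET_IP_);
	}
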
Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 19 ++++++++++++++++--- mm/slub.c | 16 ++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..986e09dcfd8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -120,11 +120,24 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) +/* + * Maximum kmalloc object size handled by SLUB. Larger object allocations + * are passed through to the page allocator. The page allocator "fastpath" + * is relatively slow so we need this value sufficiently high so that + * performance critical objects are allocated through the SLUB fastpath. + * + * This should be dropped to PAGE_SIZE / 2 once the page allocator + * "fastpath" becomes competitive with the slab allocator fastpaths. + */ +#define SLUB_MAX_SIZE (PAGE_SIZE) + +#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1) + /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. */ -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; +extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; /* * Sorry that the following has to be that ugly but some versions of GCC @@ -212,7 +225,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { - if (size > PAGE_SIZE) + if (size > SLUB_MAX_SIZE) return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { @@ -234,7 +247,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && - size <= PAGE_SIZE && !(flags & SLUB_DMA)) { + size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); if (!s) diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..5a5e7f5bf799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2658,7 +2658,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2686,7 +2686,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -2985,7 +2985,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3022,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct 
kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3222,7 +3222,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3238,7 +3238,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); -- cgit v1.2.3 From e8120ff1ffc51102ead1f4c98a3fd5d26fefc722 Mon Sep 17 00:00:00 2001 From: Zhang Yanmin Date: Thu, 12 Feb 2009 18:00:17 +0200 Subject: SLUB: Fix default slab order for big object sizes The default order of kmalloc-8192 on 2*4 stoakley is an issue of calculate_order. slab_size order name ------------------------------------------------- 4096 3 sgpool-128 8192 2 kmalloc-8192 16384 3 kmalloc-16384 kmalloc-8192's default order is smaller than sgpool-128's. On 4*4 tigerton machine, a similiar issue appears on another kmem_cache. Function calculate_order uses 'min_objects /= 2;' to shrink. Plus size calculation/checking in slab_order, sometimes above issue appear. Below patch against 2.6.29-rc2 fixes it. I checked the default orders of all kmem_cache and they don't become smaller than before. So the patch wouldn't hurt performance. Signed-off-by Zhang Yanmin Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5a5e7f5bf799..c01a7a3001d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1844,6 +1844,7 @@ static inline int calculate_order(int size) int order; int min_objects; int fraction; + int max_objects; /* * Attempt to find best configuration for a slab. This @@ -1856,6 +1857,9 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = (PAGE_SIZE << slub_max_order)/size; + min_objects = min(min_objects, max_objects); + while (min_objects > 1) { fraction = 16; while (fraction >= 4) { @@ -1865,7 +1869,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects /= 2; + min_objects --; } /* -- cgit v1.2.3 From cae3aeb83fef5a7c9c8ac40e653e59dd9a35469c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 21 Feb 2009 16:56:23 +0900 Subject: percpu: clean up size usage Andrew was concerned about the unit of variables named or have suffix size. Every usage in percpu allocator is in bytes but make it super clear by adding comments. While at it, make pcpu_depopulate_chunk() take int @off and @size like everyone else. 
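A small illustration of the convention the added comments spell out (sketch only, not part of the patch): offsets and sizes inside a chunk are byte quantities, and page indices are derived only at the populate/depopulate boundary with PFN_DOWN()/PFN_UP():

	static void sketch_bytes_to_page_range(int off, int size)
	{
		int page_start = PFN_DOWN(off);		/* first page touched */
		int page_end = PFN_UP(off + size);	/* one past the last page */

		/*
		 * e.g. off = 4608, size = 8192 on 4K pages gives
		 * page_start = 1, page_end = 4, i.e. pages 1, 2 and 3.
		 */
		(void)page_start;
		(void)page_end;
	}
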
Signed-off-by: Tejun Heo Cc: Andrew Morton --- mm/percpu.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 4617d97e877c..997724c2ea24 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -119,7 +119,7 @@ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ static int pcpu_size_to_slot(int size) { - int highbit = fls(size); + int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } @@ -158,8 +158,8 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, /** * pcpu_realloc - versatile realloc * @p: the current pointer (can be NULL for new allocations) - * @size: the current size (can be 0 for new allocations) - * @new_size: the wanted new size (can be 0 for free) + * @size: the current size in bytes (can be 0 for new allocations) + * @new_size: the wanted new size in bytes (can be 0 for free) * * More robust realloc which can be used to allocate, resize or free a * memory area of arbitrary size. If the needed size goes over @@ -290,8 +290,8 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * pcpu_split_block - split a map block * @chunk: chunk of interest * @i: index of map block to split - * @head: head size (can be 0) - * @tail: tail size (can be 0) + * @head: head size in bytes (can be 0) + * @tail: tail size in bytes (can be 0) * * Split the @i'th map block into two or three blocks. If @head is * non-zero, @head bytes block is inserted before block @i moving it @@ -346,7 +346,7 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest - * @size: wanted size + * @size: wanted size in bytes * @align: wanted align * * Try to allocate @size bytes area aligned at @align from @chunk. @@ -540,15 +540,15 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate * @off: offset to the area to depopulate - * @size: size of the area to depopulate + * @size: size of the area to depopulate in bytes * @flush: whether to flush cache and tlb or not * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping * and tlb after. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off, - size_t size, bool flush) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, + bool flush) { int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); @@ -617,7 +617,7 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @off: offset to the area to populate - * @size: size of the area to populate + * @size: size of the area to populate in bytes * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. The area is cleared on return. @@ -707,7 +707,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) /** * __alloc_percpu - allocate percpu area - * @size: size of area to allocate + * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * * Allocate percpu area of @size bytes aligned at @align. 
Might @@ -819,6 +819,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_static - initialize kernel static percpu area * @populate_pte_fn: callback to allocate pagetable * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages + * @cpu_size: the size of static percpu area in bytes * * Initialize kernel static percpu area. The caller should allocate * all the necessary pages and pass them in @pages. -- cgit v1.2.3 From 3b89d7d881a1dbb4da158f7eb5d6b3ceefc72810 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:07 -0800 Subject: slub: move min_partial to struct kmem_cache Although it allows for better cacheline use, it is unnecessary to save a copy of the cache's min_partial value in each kmem_cache_node. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 2 +- mm/slub.c | 29 ++++++++++++++++------------- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..f20a89e4d52c 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -46,7 +46,6 @@ struct kmem_cache_cpu { struct kmem_cache_node { spinlock_t list_lock; /* Protect partial list and nr_partial */ unsigned long nr_partial; - unsigned long min_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG atomic_long_t nr_slabs; @@ -89,6 +88,7 @@ struct kmem_cache { void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ + unsigned long min_partial; const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SLUB_DEBUG diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..4fff385b17a3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1335,7 +1335,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > n->min_partial) { + n->nr_partial > s->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1387,7 +1387,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < n->min_partial) { + if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1928,17 +1928,6 @@ static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - - /* - * The larger the object size is, the more pages we want on the partial - * list to avoid pounding the page allocator excessively. - */ - n->min_partial = ilog2(s->size); - if (n->min_partial < MIN_PARTIAL) - n->min_partial = MIN_PARTIAL; - else if (n->min_partial > MAX_PARTIAL) - n->min_partial = MAX_PARTIAL; - spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG @@ -2181,6 +2170,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif +static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. 
@@ -2319,6 +2317,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (!calculate_sizes(s, -1)) goto error; + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + calculate_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; -- cgit v1.2.3 From 73d342b169db700b5a6ad626fe4b86911efec8db Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:09 -0800 Subject: slub: add min_partial sysfs tunable Now that a cache's min_partial has been moved to struct kmem_cache, it's possible to easily tune it from userspace by adding a sysfs attribute. It may not be desirable to keep a large number of partial slabs around if a cache is used infrequently and memory, especially when constrained by a cgroup, is scarce. It's better to allow userspace to set the minimum policy per cache instead of relying explicitly on kmem_cache_shrink(). The memory savings from simply moving min_partial from struct kmem_cache_node to struct kmem_cache is obviously not significant (unless maybe you're from SGI or something), at the largest it's # allocated caches * (MAX_NUMNODES - 1) * sizeof(unsigned long) The true savings occurs when userspace reduces the number of partial slabs that would otherwise be wasted, especially on machines with a large number of nodes (ia64 with CONFIG_NODES_SHIFT at 10 for default?). As well as the kernel estimates ideal values for n->min_partial and ensures it's within a sane range, userspace has no other input other than writing to /sys/kernel/slab/cache/shrink. There simply isn't any better heuristic to add when calculating the partial values for a better estimate that works for all possible caches. And since it's currently a static value, the user really has no way of reclaiming that wasted space, which can be significant when constrained by a cgroup (either cpusets or, later, memory controller slab limits) without shrinking it entirely. This also allows the user to specify that increased fragmentation and more partial slabs are actually desired to avoid the cost of allocating new slabs at runtime for specific caches. There's also no reason why this should be a per-struct kmem_cache_node value in the first place. You could argue that a machine would have such node size asymmetries that it should be specified on a per-node basis, but we know nobody is doing that right now since it's a purely static value at the moment and there's no convenient way to tune that via slub's sysfs interface. 
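Once the attribute below is in place, the tunable appears as /sys/kernel/slab/<cache>/min_partial. A minimal userspace sketch of adjusting it; "dentry" is only an example cache name, and the written value is still clamped by calculate_min_partial():

	#include <stdio.h>

	int main(void)
	{
		/* "dentry" is just an example; any /sys/kernel/slab/<cache>/ works */
		FILE *f = fopen("/sys/kernel/slab/dentry/min_partial", "w");

		if (!f)
			return 1;
		fprintf(f, "5\n");	/* clamped to [MIN_PARTIAL, MAX_PARTIAL] */
		fclose(f);
		return 0;
	}
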
Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4fff385b17a3..a3e2d552ff46 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3838,6 +3838,26 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR(order); +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%lu\n", s->min_partial); +} + +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = strict_strtoul(buf, 10, &min); + if (err) + return err; + + calculate_min_partial(s, min); + return length; +} +SLAB_ATTR(min_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (s->ctor) { @@ -4153,6 +4173,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &total_objects_attr.attr, -- cgit v1.2.3 From cb83b42e23bd6c4bf91793a320fbe83787c13596 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:20 +0900 Subject: percpu: fix pcpu_chunk_struct_size Impact: fix short allocation leading to memory corruption While dropping rvalue wrapping macros around global parameters, pcpu_chunk_struct_size was set incorrectly resulting in shorter page pointer array. Fix it. Signed-off-by: Tejun Heo --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 997724c2ea24..ed92caa2aa3b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -850,7 +850,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + (1 << pcpu_unit_pages_shift) * sizeof(struct page *); + + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); /* allocate chunk slots */ pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); -- cgit v1.2.3 From c132937556f56ee4b831ef4b23f1846e05fde102 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:20 +0900 Subject: bootmem: clean up arch-specific bootmem wrapping Impact: cleaner and consistent bootmem wrapping By setting CONFIG_HAVE_ARCH_BOOTMEM_NODE, archs can define arch-specific wrappers for bootmem allocation. However, this is done a bit strangely in that only the high level convenience macros can be changed while lower level, but still exported, interface functions can't be wrapped. This not only is messy but also leads to strange situation where alloc_bootmem() does what the arch wants it to do but the equivalent __alloc_bootmem() call doesn't although they should be able to be used interchangeably. This patch updates bootmem such that archs can override / wrap the backend function - alloc_bootmem_core() instead of the highlevel interface functions to allow simpler and consistent wrapping. Also, HAVE_ARCH_BOOTMEM_NODE is renamed to HAVE_ARCH_BOOTMEM. 
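Distilled from the x86_32 hunk below, a sketch of what the arch-side override now looks like for an architecture that selects CONFIG_HAVE_ARCH_BOOTMEM: a single wrapper around the backend instead of a copy of every high-level helper (the "always node 0" policy is the x86_32 example, not a requirement):

	#define alloc_bootmem_core(bdata, size, align, goal, limit)		\
	({									\
		bootmem_data_t __maybe_unused *__bdata_dummy = (bdata);	\
		/* this arch steers every bootmem allocation to node 0 */	\
		__alloc_bootmem_core(NODE_DATA(0)->bdata,			\
				     (size), (align), (goal), (limit));	\
	})
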
Signed-off-by: Tejun Heo Cc: Johannes Weiner --- arch/avr32/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/mmzone_32.h | 43 +++++----------------------------------- include/linux/bootmem.h | 10 ++++------ mm/bootmem.c | 14 ++++++++++--- 5 files changed, 22 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index b189680d18b0..05fe3053dcae 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt" config QUICKLIST def_bool y -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool n config ARCH_HAVE_MEMORY_PRESENT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d3f6eadfd4ba..6fd3b2302ed9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1111,7 +1111,7 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 07f1af494ca5..1e0fa9e63afa 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -93,45 +93,12 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES - -/* - * Following are macros that are specific to this numa platform. - */ -#define reserve_bootmem(addr, size, flags) \ - reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) -#define alloc_bootmem(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) -#define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define alloc_bootmem_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_low_pages_node(pgdat, x) \ +/* always use node 0 for bootmem on this numa platform */ +#define alloc_bootmem_core(__bdata, size, align, goal, limit) \ ({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ + bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \ + __alloc_bootmem_core(NODE_DATA(0)->bdata, \ + (size), (align), (goal), (limit)); \ }) #endif /* CONFIG_NEED_MULTIPLE_NODES */ diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 95837bfb5256..3a87f93081ed 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -69,10 +69,9 @@ extern int reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags); -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -extern int reserve_bootmem(unsigned long addr, 
unsigned long size, int flags); -#endif - +extern int reserve_bootmem(unsigned long addr, + unsigned long size, + int flags); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal); @@ -94,7 +93,7 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE + #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ @@ -113,7 +112,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, int flags); diff --git a/mm/bootmem.c b/mm/bootmem.c index 51a0ccf61e0e..d7140c008ba8 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); static int bootmem_debug; +/* + * If an arch needs to apply workarounds to bootmem allocation, it can + * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around + * __alloc_bootmem_core(). + */ +#ifndef CONFIG_HAVE_ARCH_BOOTMEM +#define alloc_bootmem_core(bdata, size, align, goal, limit) \ + __alloc_bootmem_core((bdata), (size), (align), (goal), (limit)) +#endif + static int __init bootmem_debug_setup(char *buf) { bootmem_debug = 1; @@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE /** * reserve_bootmem - mark a page range as usable * @addr: starting address of the range @@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) @@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, return ALIGN(base + off, align) - base; } -static void * __init alloc_bootmem_core(struct bootmem_data *bdata, +static void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -- cgit v1.2.3 From c0c0a29379b5848aec2e8f1c58d853d3cb7118b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: vmalloc: add @align to vm_area_register_early() Impact: allow larger alignment for early vmalloc area allocation Some early vmalloc users might want larger alignment, for example, for custom large page mapping. Add @align to vm_area_register_early(). While at it, drop docbook comment on non-existent @size. 
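As a usage sketch (not part of the patch), an early-boot caller that wants a large-page-aligned area would now look roughly like this; PMD_SIZE is only an example alignment for the "custom large page mapping" case mentioned above:

	static struct vm_struct example_vm;

	static void __init example_register_early(size_t bytes)
	{
		example_vm.flags = VM_ALLOC;
		example_vm.size = bytes;
		vm_area_register_early(&example_vm, PMD_SIZE);
		/* example_vm.addr is now a PMD_SIZE-aligned address in the vmalloc range */
	}
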
Signed-off-by: Tejun Heo Cc: Nick Piggin Cc: Ivan Kokshaysky --- arch/alpha/mm/init.c | 2 +- include/linux/vmalloc.h | 2 +- mm/percpu.c | 2 +- mm/vmalloc.c | 11 +++++++---- 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index df6df025ded4..91eddd8505df 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -200,7 +200,7 @@ callback_init(void * kernel_end) /* register the vm area */ console_remap_vm.flags = VM_ALLOC; console_remap_vm.size = nr_pages << PAGE_SHIFT; - vm_area_register_early(&console_remap_vm); + vm_area_register_early(&console_remap_vm, PAGE_SIZE); vaddr = (unsigned long)consle_remap_vm.addr; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 599ba7984310..2f6994fdf0e0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -109,6 +109,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; -extern __init void vm_area_register_early(struct vm_struct *vm); +extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/percpu.c b/mm/percpu.c index ed92caa2aa3b..41e7a5f5ab1b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -860,7 +860,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, /* init and register vm area */ static_vm.flags = VM_ALLOC; static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm); + vm_area_register_early(&static_vm, PAGE_SIZE); /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 224eca9650a8..366ae9ea6af2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -995,7 +995,7 @@ EXPORT_SYMBOL(vm_map_ram); /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register - * @size: size of area to register + * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain @@ -1004,12 +1004,15 @@ EXPORT_SYMBOL(vm_map_ram); * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ -void __init vm_area_register_early(struct vm_struct *vm) +void __init vm_area_register_early(struct vm_struct *vm, size_t align) { static size_t vm_init_off __initdata; + unsigned long addr; + + addr = ALIGN(VMALLOC_START + vm_init_off, align); + vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; - vm->addr = (void *)VMALLOC_START + vm_init_off; - vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + vm->addr = (void *)addr; vm->next = vmlist; vmlist = vm; -- cgit v1.2.3 From d9b55eeb1d55ef2dc5a4fdbff9604c2c68cb5649 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: percpu: remove unit_size power-of-2 restriction Impact: allow unit_size to be arbitrary multiple of PAGE_SIZE In dynamic percpu allocator, there is no reason the unit size should be power of two. Remove the restriction. As non-power-of-two unit size means that empty chunks fall into the same slot index as lightly occupied chunks which is bad for reclaming. Reserve an extra slot for empty chunks. 
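The effect on chunk bookkeeping can be summarised in one helper (sketch only, mirroring the slot logic in the hunks below): completely free chunks no longer land in whatever slot their now non-power-of-two unit size happens to map to, but always in a dedicated last slot, which keeps them easy to find when reclaiming:

	static int sketch_size_to_slot(int free_size)
	{
		if (free_size == pcpu_unit_size)	/* completely free chunk */
			return pcpu_nr_slots - 1;	/* the reserved extra slot */
		/* everyone else is grouped by the highest set bit of free_size */
		return max(fls(free_size) - PCPU_SLOT_BASE_SHIFT + 2, 1);
	}
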
Signed-off-by: Tejun Heo --- mm/percpu.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 41e7a5f5ab1b..d9e6e5d1dbd4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -67,7 +67,7 @@ #include #include -#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */ +#define PCPU_MIN_UNIT_PAGES 16 /* max alloc size in pages */ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -83,9 +83,7 @@ struct pcpu_chunk { struct page *page[]; /* #cpus * UNIT_PAGES */ }; -static int pcpu_unit_pages_shift; static int pcpu_unit_pages; -static int pcpu_unit_shift; static int pcpu_unit_size; static int pcpu_chunk_size; static int pcpu_nr_slots; @@ -117,12 +115,19 @@ static DEFINE_MUTEX(pcpu_mutex); static struct list_head *pcpu_slot; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ -static int pcpu_size_to_slot(int size) +static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } +static int pcpu_size_to_slot(int size) +{ + if (size == pcpu_unit_size) + return pcpu_nr_slots - 1; + return __pcpu_size_to_slot(size); +} + static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) @@ -133,7 +138,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return (cpu << pcpu_unit_pages_shift) + page_idx; + return cpu * pcpu_unit_pages + page_idx; } static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, @@ -659,7 +664,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) goto err; for_each_possible_cpu(cpu) - memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0, + memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, size); return 0; @@ -722,7 +727,7 @@ void *__alloc_percpu(size_t size, size_t align) struct pcpu_chunk *chunk; int slot, off; - if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT || + if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); @@ -840,19 +845,19 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, unsigned int cpu; int err, i; - pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT, - order_base_2(cpu_size) - PAGE_SHIFT); + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size)); pcpu_static_size = cpu_size; - pcpu_unit_pages = 1 << pcpu_unit_pages_shift; - pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift; - pcpu_unit_size = 1 << pcpu_unit_shift; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); - /* allocate chunk slots */ + /* + * Allocate chunk slots. The additional last slot is for + * empty chunks. 
+ */ + pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); -- cgit v1.2.3 From 8d408b4be37bc49c9086531f2ebe411cf5731746 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: percpu: give more latitude to arch specific first chunk initialization Impact: more latitude for first percpu chunk allocation The first percpu chunk serves the kernel static percpu area and may or may not contain extra room for further dynamic allocation. Initialization of the first chunk needs to be done before normal memory allocation service is up, so it has its own init path - pcpu_setup_static(). It seems archs need more latitude while initializing the first chunk for example to take advantage of large page mapping. This patch makes the following changes to allow this. * Define PERCPU_DYNAMIC_RESERVE to give arch hint about how much space to reserve in the first chunk for further dynamic allocation. * Rename pcpu_setup_static() to pcpu_setup_first_chunk(). * Make pcpu_setup_first_chunk() much more flexible by fetching page pointer by callback and adding optional @unit_size, @free_size and @base_addr arguments which allow archs to selectively part of chunk initialization to their likings. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 15 ++++- include/linux/percpu.h | 39 ++++++++++- mm/percpu.c | 149 ++++++++++++++++++++++++++++++++--------- 3 files changed, 167 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 671e6528a82d..d928e8887201 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,16 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -109,7 +119,10 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); + pcpu4k_pages = pages; + pcpu4k_nr_static_pages = nr_cpu_pages; + pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0, + NULL, pcpu4k_populate_pte); free_bootmem(__pa(pages), pages_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 18080995ff3e..910beb0abea2 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -78,12 +78,47 @@ #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +/* minimum unit size, also is the maximum supported allocation size */ +#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT) + +/* + * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy + * back on the first chunk if arch is manually allocating and mapping + * it for faster access (as a part of large page mapping for example). + * Note that dynamic percpu allocator covers both static and dynamic + * areas, so these values are bigger than PERCPU_MODULE_RESERVE. + * + * On typical configuration with modules, the following values leave + * about 8k of free space on the first chunk after boot on both x86_32 + * and 64 when module support is enabled. 
When module support is + * disabled, it's much tighter. + */ +#ifndef PERCPU_DYNAMIC_RESERVE +# if BITS_PER_LONG > 32 +# ifdef CONFIG_MODULES +# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT) +# else +# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# endif +# else +# ifdef CONFIG_MODULES +# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# else +# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT) +# endif +# endif +#endif /* PERCPU_DYNAMIC_RESERVE */ + extern void *pcpu_base_addr; +typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); -extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, - struct page **pages, size_t cpu_size); +extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index d9e6e5d1dbd4..9ac01980cce0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -48,8 +48,8 @@ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back * - * - use pcpu_setup_static() during percpu area initialization to - * setup kernel static percpu area + * - use pcpu_setup_first_chunk() during percpu area initialization to + * setup the first chunk containing the kernel static percpu area */ #include @@ -67,7 +67,6 @@ #include #include -#define PCPU_MIN_UNIT_PAGES 16 /* max alloc size in pages */ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -80,6 +79,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + bool immutable; /* no [de]population allowed */ struct page *page[]; /* #cpus * UNIT_PAGES */ }; @@ -521,6 +521,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, unsigned int last = num_possible_cpus() - 1; unsigned int cpu; + /* unmap must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + /* * Each flushing trial can be very expensive, issue flush on * the whole region at once rather than doing it for each cpu. 
@@ -602,6 +605,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) unsigned int cpu; int err; + /* map must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + for_each_possible_cpu(cpu) { err = map_kernel_range_noflush( pcpu_chunk_addr(chunk, cpu, page_start), @@ -727,8 +733,7 @@ void *__alloc_percpu(size_t size, size_t align) struct pcpu_chunk *chunk; int slot, off; - if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE || - align > PAGE_SIZE)) { + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); return NULL; @@ -776,6 +781,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { + WARN_ON(chunk->immutable); pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); list_del(&chunk->list); rb_erase(&chunk->rb_node, &pcpu_addr_root); @@ -821,33 +827,73 @@ void free_percpu(void *ptr) EXPORT_SYMBOL_GPL(free_percpu); /** - * pcpu_setup_static - initialize kernel static percpu area - * @populate_pte_fn: callback to allocate pagetable - * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages - * @cpu_size: the size of static percpu area in bytes - * - * Initialize kernel static percpu area. The caller should allocate - * all the necessary pages and pass them in @pages. - * @populate_pte_fn() is called on each page to be used for percpu - * mapping and is responsible for making sure all the necessary page - * tables for the page is allocated. + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @get_page_fn: callback to fetch page pointer + * @static_size: the size of static percpu area in bytes + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto + * @free_size: free size in bytes, 0 for auto + * @base_addr: mapped address, NULL for auto + * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * + * Initialize the first percpu chunk which contains the kernel static + * perpcu area. This function is to be called from arch percpu area + * setup path. The first two parameters are mandatory. The rest are + * optional. + * + * @get_page_fn() should return pointer to percpu page given cpu + * number and page number. It should at least return enough pages to + * cover the static area. The returned pages for static area should + * have been initialized with valid data. If @unit_size is specified, + * it can also return pages after the static area. NULL return + * indicates end of pages for the cpu. Note that @get_page_fn() must + * return the same number of pages for all cpus. + * + * @unit_size, if non-zero, determines unit size and must be aligned + * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * + * @free_size determines the number of free bytes after the static + * area in the first chunk. If zero, whatever left is available. + * Specifying non-zero value make percpu leave the area after + * @static_size + @free_size alone. + * + * Non-null @base_addr means that the caller already allocated virtual + * region for the first chunk and mapped it. percpu must not mess + * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL + * @populate_pte_fn doesn't make any sense. + * + * @populate_pte_fn is used to populate the pagetable. NULL means the + * caller already populated the pagetable. * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. 
*/ -size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, - struct page **pages, size_t cpu_size) +size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct static_vm; struct pcpu_chunk *static_chunk; - int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); unsigned int cpu; + int nr_pages; int err, i; - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size)); + /* santiy checks */ + BUG_ON(!static_size); + BUG_ON(!unit_size && free_size); + BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(base_addr && !unit_size); + BUG_ON(base_addr && populate_pte_fn); - pcpu_static_size = cpu_size; + if (unit_size) + pcpu_unit_pages = unit_size >> PAGE_SHIFT; + else + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, + PFN_UP(static_size)); + + pcpu_static_size = static_size; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -862,29 +908,66 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init and register vm area */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm, PAGE_SIZE); - /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&static_chunk->list); static_chunk->vm = &static_vm; - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + + if (free_size) + static_chunk->free_size = free_size; + else + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; - /* assign pages and map them */ + /* allocate vm address */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + + if (!base_addr) + vm_area_register_early(&static_vm, PAGE_SIZE); + else { + /* + * Pages already mapped. No need to remap into + * vmalloc area. In this case the static chunk can't + * be mapped or unmapped by percpu and is marked + * immutable. 
+ */ + static_vm.addr = base_addr; + static_chunk->immutable = true; + } + + /* assign pages */ + nr_pages = -1; for_each_possible_cpu(cpu) { - for (i = 0; i < nr_cpu_pages; i++) { - *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; - populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); + for (i = 0; i < pcpu_unit_pages; i++) { + struct page *page = get_page_fn(cpu, i); + + if (!page) + break; + *pcpu_chunk_pagep(static_chunk, cpu, i) = page; } + + BUG_ON(i < PFN_UP(pcpu_static_size)); + + if (nr_pages < 0) + nr_pages = i; + else + BUG_ON(nr_pages != i); } - err = pcpu_map(static_chunk, 0, nr_cpu_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", err); + /* map them */ + if (populate_pte_fn) { + for_each_possible_cpu(cpu) + for (i = 0; i < nr_pages; i++) + populate_pte_fn(pcpu_chunk_addr(static_chunk, + cpu, i)); + + err = pcpu_map(static_chunk, 0, nr_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", + err); + } /* link static_chunk in */ pcpu_chunk_relocate(static_chunk, -1); -- cgit v1.2.3 From 40150d37be7f7949b2ec07d511244da856647d84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 12:32:28 +0900 Subject: percpu: add __read_mostly to variables which are mostly read only Most global variables in percpu allocator are initialized during boot and read only from that point on. Add __read_mostly as per Rusty's suggestion. Signed-off-by: Tejun Heo Cc: Rusty Russell --- mm/percpu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 9ac01980cce0..5954e7a9eb1e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -83,18 +83,18 @@ struct pcpu_chunk { struct page *page[]; /* #cpus * UNIT_PAGES */ }; -static int pcpu_unit_pages; -static int pcpu_unit_size; -static int pcpu_chunk_size; -static int pcpu_nr_slots; -static size_t pcpu_chunk_struct_size; +static int pcpu_unit_pages __read_mostly; +static int pcpu_unit_size __read_mostly; +static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_slots __read_mostly; +static size_t pcpu_chunk_struct_size __read_mostly; /* the address of the first chunk which starts with the kernel static area */ -void *pcpu_base_addr; +void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); /* the size of kernel static area */ -static int pcpu_static_size; +static int pcpu_static_size __read_mostly; /* * One mutex to rule them all. @@ -112,7 +112,7 @@ static int pcpu_static_size; */ static DEFINE_MUTEX(pcpu_mutex); -static struct list_head *pcpu_slot; /* chunk list slots */ +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ static int __pcpu_size_to_slot(int size) -- cgit v1.2.3 From c0bdb232b23b51c23e551041510ad6bea5ce5a92 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 25 Feb 2009 09:16:35 +0200 Subject: slub: rename calculate_min_partial() to set_min_partial() As suggested by Christoph Lameter, rename calculate_min_partial() to set_min_partial() as the function doesn't really do any calculations. 
Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a3e2d552ff46..77268d18e78d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2170,7 +2170,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif -static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +static void set_min_partial(struct kmem_cache *s, unsigned long min) { if (min < MIN_PARTIAL) min = MIN_PARTIAL; @@ -2321,7 +2321,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - calculate_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3853,7 +3853,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, if (err) return err; - calculate_min_partial(s, min); + set_min_partial(s, min); return length; } SLAB_ATTR(min_partial); -- cgit v1.2.3 From 3255aa2eb636a508fc82a73fabbb8aaf2ff23c0f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 08:21:52 +0100 Subject: x86, mm: pass in 'total' to __copy_from_user_*nocache() Impact: cleanup, enable future change Add a 'total bytes copied' parameter to __copy_from_user_*nocache(), and update all the callsites. The parameter is not used yet - architecture code can use it to more intelligently decide whether the copy should be cached or non-temporal. Cc: Salman Qazi Cc: Nick Piggin Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_32.h | 4 ++-- arch/x86/include/asm/uaccess_64.h | 5 ++--- drivers/gpu/drm/i915/i915_gem.c | 2 +- include/linux/uaccess.h | 4 ++-- mm/filemap.c | 10 ++++++---- mm/filemap_xip.c | 2 +- 6 files changed, 14 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 5e06259e90e5..a0ba61386972 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -157,7 +157,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) } static __always_inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n) + const void __user *from, unsigned long n, unsigned long total) { might_fault(); if (__builtin_constant_p(n)) { @@ -180,7 +180,7 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, - unsigned long n) + unsigned long n, unsigned long total) { return __copy_from_user_ll_nocache_nozero(to, from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 987a2c10fe20..a748253db0c9 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -189,7 +189,7 @@ extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); static inline int __copy_from_user_nocache(void *dst, const void __user *src, - unsigned size) + unsigned size, unsigned long total) { might_sleep(); /* @@ -205,8 +205,7 @@ static inline int __copy_from_user_nocache(void *dst, const void __user *src, } static inline int __copy_from_user_inatomic_nocache(void *dst, - const void __user *src, - unsigned size) + const void __user *src, 
unsigned size, unsigned total) { if (likely(size >= PAGE_SIZE)) return __copy_user_nocache(dst, src, size, 0); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 818576654092..6b209db8370d 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -215,7 +215,7 @@ fast_user_write(struct io_mapping *mapping, vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base); unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset, - user_data, length); + user_data, length, length); io_mapping_unmap_atomic(vaddr_atomic); if (unwritten) return -EFAULT; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 6b58367d145e..6f3c603b0d67 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -41,13 +41,13 @@ static inline void pagefault_enable(void) #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, - const void __user *from, unsigned long n) + const void __user *from, unsigned long n, unsigned long total) { return __copy_from_user_inatomic(to, from, n); } static inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n) + const void __user *from, unsigned long n, unsigned long total) { return __copy_from_user(to, from, n); } diff --git a/mm/filemap.c b/mm/filemap.c index 23acefe51808..60fd56772cc6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1816,14 +1816,14 @@ EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { - size_t copied = 0, left = 0; + size_t copied = 0, left = 0, total = bytes; while (bytes) { char __user *buf = iov->iov_base + base; int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); + left = __copy_from_user_inatomic_nocache(vaddr, buf, copy, total); copied += copy; bytes -= copy; vaddr += copy; @@ -1851,8 +1851,9 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic_nocache(kaddr + offset, - buf, bytes); + buf, bytes, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, @@ -1880,7 +1881,8 @@ size_t iov_iter_copy_from_user(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_nocache(kaddr + offset, buf, bytes); + + left = __copy_from_user_nocache(kaddr + offset, buf, bytes, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0c04615651b7..bf54f8a2cf1d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -354,7 +354,7 @@ __xip_file_write(struct file *filp, const char __user *buf, break; copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes); + __copy_from_user_nocache(xip_mem + offset, buf, bytes, bytes); if (likely(copied > 0)) { status = copied; -- cgit v1.2.3 From 34754b69a6f87aa6aa2860525a82f12532f83afd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 16:04:03 +0100 Subject: x86: make vmap yell louder when it is used under irqs_disabled() Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 6 +++--- mm/vmalloc.c | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git 
a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a84ac7b570e6..6907b8e85d52 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -498,12 +498,12 @@ void *text_poke_early(void *addr, const void *opcode, size_t len) */ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) { - unsigned long flags; char *vaddr; int nr_pages = 2; struct page *pages[2]; int i; + might_sleep(); if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); @@ -517,9 +517,9 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) nr_pages = 1; vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); BUG_ON(!vaddr); - local_irq_save(flags); + local_irq_disable(); memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - local_irq_restore(flags); + local_irq_enable(); vunmap(vaddr); sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4dd2636d0b92..f83a70167b99 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1257,6 +1257,7 @@ EXPORT_SYMBOL(vfree); void vunmap(const void *addr) { BUG_ON(in_interrupt()); + might_sleep(); __vunmap(addr, 0); } EXPORT_SYMBOL(vunmap); @@ -1276,6 +1277,8 @@ void *vmap(struct page **pages, unsigned int count, { struct vm_struct *area; + might_sleep(); + if (count > num_physpages) return NULL; -- cgit v1.2.3 From 02d51fdfb2bfcf6bbd776f983177f55868aa0a79 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 1 Mar 2009 15:42:36 +0900 Subject: percpu: kill compile warning in pcpu_populate_chunk() Impact: remove compile warning Mark local variable map_end in pcpu_populate_chunk() with uninitialized_var(). The variable is always used in tandem with map_start and guaranteed to be initialized before use but gcc doesn't understand that. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 5954e7a9eb1e..3d0f5456827c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -639,7 +639,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); int map_start = -1; - int map_end; + int uninitialized_var(map_end); unsigned int cpu; int i; -- cgit v1.2.3 From d0c4f570276cb4d2dc4215b90eb7cb6e2bdd4a15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 1 Mar 2009 16:06:56 +0900 Subject: bootmem, x86: further fixes for arch-specific bootmem wrapping Impact: fix new breakages introduced by previous fix Commit c132937556f56ee4b831ef4b23f1846e05fde102 tried to clean up the bootmem arch wrapper but it wasn't quite correct. Before the commit, the following were broken. * Low level interface functions prefixed with __ ignored arch preference. * reserve_bootmem(...) can't be mapped into reserve_bootmem_node(NODE_DATA(0)->bdata, ...) because the node is not merely a preference there: the region specified MUST fall within the specified node; otherwise, it will panic. After the commit, * If allocation fails for the arch preferred node, it should fall back to whatever is available. Instead, it simply failed the allocation. There are too many internal details to allow generic wrapping and still keep things simple for archs. Plus, all that an arch wants is a way to prefer a certain node over another. This patch drops the generic wrapping around alloc_bootmem_core() and adds alloc_arch_preferred_bootmem() instead.
If necessary, the arch can define a bootmem_arch_preferred_node() macro or function which takes all allocation information and returns the preferred node. bootmem generic code will always try the preferred node first and then fall back to other nodes as usual. Breakages noted and changes reviewed by Johannes Weiner. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- arch/x86/include/asm/mmzone_32.h | 8 ++----- mm/bootmem.c | 45 ++++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index eeacf67de49e..ede6998bd92c 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -92,12 +92,8 @@ static inline int pfn_valid(int pfn) #ifdef CONFIG_NEED_MULTIPLE_NODES /* always use node 0 for bootmem on this numa platform */ -#define alloc_bootmem_core(__bdata, size, align, goal, limit) \ -({ \ - bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \ - __alloc_bootmem_core(NODE_DATA(0)->bdata, \ - (size), (align), (goal), (limit)); \ -}) +#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ + (NODE_DATA(0)->bdata) #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/mm/bootmem.c b/mm/bootmem.c index d7140c008ba8..daf92713f7de 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -37,16 +37,6 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); static int bootmem_debug; -/* - * If an arch needs to apply workarounds to bootmem allocation, it can - * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around - * __alloc_bootmem_core(). - */ -#ifndef CONFIG_HAVE_ARCH_BOOTMEM -#define alloc_bootmem_core(bdata, size, align, goal, limit) \ - __alloc_bootmem_core((bdata), (size), (align), (goal), (limit)) -#endif - static int __init bootmem_debug_setup(char *buf) { bootmem_debug = 1; @@ -436,9 +426,9 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, return ALIGN(base + off, align) - base; } -static void * __init __alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) +static void * __init alloc_bootmem_core(struct bootmem_data *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; @@ -538,17 +528,34 @@ find_block: return NULL; } +static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ +#ifdef CONFIG_HAVE_ARCH_BOOTMEM + bootmem_data_t *p_bdata; + + p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); + if (p_bdata) + return alloc_bootmem_core(p_bdata, size, align, goal, limit); +#endif + return NULL; +} + static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { bootmem_data_t *bdata; + void *region; restart: - list_for_each_entry(bdata, &bdata_list, list) { - void *region; + region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); + if (region) + return region; + list_for_each_entry(bdata, &bdata_list, list) { if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) @@ -626,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, { void *ptr; + ptr =
alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -682,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; -- cgit v1.2.3 From f180053694b43d5714bf56cb95499a3c32ff155c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Mar 2009 11:00:57 +0100 Subject: x86, mm: dont use non-temporal stores in pagecache accesses Impact: standardize IO on cached ops On modern CPUs it is almost always a bad idea to use non-temporal stores, as the regression in this commit has shown it: 30d697f: x86: fix performance regression in write() syscall The kernel simply has no good information about whether using non-temporal stores is a good idea or not - and trying to add heuristics only increases complexity and inserts fragility. The regression on cached write()s took very long to be found - over two years. So dont take any chances and let the hardware decide how it makes use of its caches. The only exception is drivers/gpu/drm/i915/i915_gem.c: there were we are absolutely sure that another entity (the GPU) will pick up the dirty data immediately and that the CPU will not touch that data before the GPU will. Also, keep the _nocache() primitives to make it easier for people to experiment with these details. There may be more clear-cut cases where non-cached copies can be used, outside of filemap.c. Cc: Salman Qazi Cc: Nick Piggin Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_32.h | 4 ++-- arch/x86/include/asm/uaccess_64.h | 25 +++++++------------------ drivers/gpu/drm/i915/i915_gem.c | 2 +- include/linux/uaccess.h | 4 ++-- mm/filemap.c | 11 ++++------- mm/filemap_xip.c | 2 +- 6 files changed, 17 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index a0ba61386972..5e06259e90e5 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -157,7 +157,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) } static __always_inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { might_fault(); if (__builtin_constant_p(n)) { @@ -180,7 +180,7 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, - unsigned long n, unsigned long total) + unsigned long n) { return __copy_from_user_ll_nocache_nozero(to, from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index dcaa0404cf7b..8cc687326eb8 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -188,29 +188,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); -static inline int __copy_from_user_nocache(void *dst, const void __user *src, - unsigned size, unsigned long total) +static inline int +__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) { might_sleep(); - /* - * In practice this limit means that 
large file write()s - * which get chunked to 4K copies get handled via - * non-temporal stores here. Smaller writes get handled - * via regular __copy_from_user(): - */ - if (likely(total >= PAGE_SIZE)) - return __copy_user_nocache(dst, src, size, 1); - else - return __copy_from_user(dst, src, size); + return __copy_user_nocache(dst, src, size, 1); } -static inline int __copy_from_user_inatomic_nocache(void *dst, - const void __user *src, unsigned size, unsigned total) +static inline int +__copy_from_user_inatomic_nocache(void *dst, const void __user *src, + unsigned size) { - if (likely(total >= PAGE_SIZE)) - return __copy_user_nocache(dst, src, size, 0); - else - return __copy_from_user_inatomic(dst, src, size); + return __copy_user_nocache(dst, src, size, 0); } unsigned long diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 6b209db8370d..818576654092 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -215,7 +215,7 @@ fast_user_write(struct io_mapping *mapping, vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base); unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset, - user_data, length, length); + user_data, length); io_mapping_unmap_atomic(vaddr_atomic); if (unwritten) return -EFAULT; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 6f3c603b0d67..6b58367d145e 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -41,13 +41,13 @@ static inline void pagefault_enable(void) #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } static inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { return __copy_from_user(to, from, n); } diff --git a/mm/filemap.c b/mm/filemap.c index 60fd56772cc6..126d3973b3d1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1816,14 +1816,14 @@ EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { - size_t copied = 0, left = 0, total = bytes; + size_t copied = 0, left = 0; while (bytes) { char __user *buf = iov->iov_base + base; int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy, total); + left = __copy_from_user_inatomic(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; @@ -1851,9 +1851,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - - left = __copy_from_user_inatomic_nocache(kaddr + offset, - buf, bytes, bytes); + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, @@ -1881,8 +1879,7 @@ size_t iov_iter_copy_from_user(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - - left = __copy_from_user_nocache(kaddr + offset, buf, bytes, bytes); + left = __copy_from_user(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index bf54f8a2cf1d..0c04615651b7 100644 --- 
a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -354,7 +354,7 @@ __xip_file_write(struct file *filp, const char __user *buf, break; copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes, bytes); + __copy_from_user_nocache(xip_mem + offset, buf, bytes); if (likely(copied > 0)) { status = copied; -- cgit v1.2.3 From 2441d15c97d498b18f03ae9fba262ffeae42a08b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: cosmetic renames in pcpu_setup_first_chunk() Impact: cosmetic, preparation for future changes Make the following renames in pcpu_setup_first_chunk() in preparation for future changes. * s/free_size/dyn_size/ * s/static_vm/first_vm/ * s/static_chunk/schunk/ Signed-off-by: Tejun Heo --- include/linux/percpu.h | 2 +- mm/percpu.c | 58 +++++++++++++++++++++++++------------------------- 2 files changed, 30 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 2d34b038fe70..a0b4ea2a3354 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -118,7 +118,7 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, + size_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); /* diff --git a/mm/percpu.c b/mm/percpu.c index 3d0f5456827c..9531590e6b69 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -831,7 +831,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @free_size: free size in bytes, 0 for auto + * @dyn_size: free size for dynamic allocation in bytes, 0 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -849,12 +849,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * return the same number of pages for all cpus. * * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size. * - * @free_size determines the number of free bytes after the static + * @dyn_size determines the number of free bytes after the static * area in the first chunk. If zero, whatever left is available. * Specifying non-zero value make percpu leave the area after - * @static_size + @free_size alone. + * @static_size + @dyn_size alone. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it.
percpu must not mess @@ -870,19 +870,19 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, + size_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { - static struct vm_struct static_vm; - struct pcpu_chunk *static_chunk; + static struct vm_struct first_vm; + struct pcpu_chunk *schunk; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ BUG_ON(!static_size); - BUG_ON(!unit_size && free_size); - BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(!unit_size && dyn_size); + BUG_ON(unit_size && unit_size < static_size + dyn_size); BUG_ON(unit_size & ~PAGE_MASK); BUG_ON(base_addr && !unit_size); BUG_ON(base_addr && populate_pte_fn); @@ -908,24 +908,24 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static_chunk */ - static_chunk = alloc_bootmem(pcpu_chunk_struct_size); - INIT_LIST_HEAD(&static_chunk->list); - static_chunk->vm = &static_vm; + /* init static chunk */ + schunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&schunk->list); + schunk->vm = &first_vm; - if (free_size) - static_chunk->free_size = free_size; + if (dyn_size) + schunk->free_size = dyn_size; else - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + schunk->free_size = pcpu_unit_size - pcpu_static_size; - static_chunk->contig_hint = static_chunk->free_size; + schunk->contig_hint = schunk->free_size; /* allocate vm address */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; + first_vm.flags = VM_ALLOC; + first_vm.size = pcpu_chunk_size; if (!base_addr) - vm_area_register_early(&static_vm, PAGE_SIZE); + vm_area_register_early(&first_vm, PAGE_SIZE); else { /* * Pages already mapped. No need to remap into @@ -933,8 +933,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, * be mapped or unmapped by percpu and is marked * immutable. 
*/ - static_vm.addr = base_addr; - static_chunk->immutable = true; + first_vm.addr = base_addr; + schunk->immutable = true; } /* assign pages */ @@ -945,7 +945,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (!page) break; - *pcpu_chunk_pagep(static_chunk, cpu, i) = page; + *pcpu_chunk_pagep(schunk, cpu, i) = page; } BUG_ON(i < PFN_UP(pcpu_static_size)); @@ -960,20 +960,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (populate_pte_fn) { for_each_possible_cpu(cpu) for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(static_chunk, + populate_pte_fn(pcpu_chunk_addr(schunk, cpu, i)); - err = pcpu_map(static_chunk, 0, nr_pages); + err = pcpu_map(schunk, 0, nr_pages); if (err) panic("failed to setup static percpu area, err=%d\n", err); } - /* link static_chunk in */ - pcpu_chunk_relocate(static_chunk, -1); - pcpu_chunk_addr_insert(static_chunk); + /* link the first chunk in */ - pcpu_chunk_relocate(schunk, -1); Hmm
+ */ + new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0])); + if (new) + memcpy(new, chunk->map, + chunk->map_alloc * sizeof(new[0])); + } else + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); if (!new) return -ENOMEM; @@ -367,22 +377,6 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) int max_contig = 0; int i, off; - /* - * The static chunk initially doesn't have map attached - * because kmalloc wasn't available during init. Give it one. - */ - if (unlikely(!chunk->map)) { - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - if (!chunk->map) - return -ENOMEM; - - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = -pcpu_static_size; - if (chunk->free_size) - chunk->map[chunk->map_used++] = chunk->free_size; - } - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { bool is_last = i + 1 == chunk->map_used; int head, tail; @@ -874,12 +868,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; + static int smap[2]; struct pcpu_chunk *schunk; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); BUG_ON(!unit_size && dyn_size); BUG_ON(unit_size && unit_size < static_size + dyn_size); @@ -893,7 +889,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, PFN_UP(static_size)); - pcpu_static_size = static_size; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -912,14 +907,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->vm = &first_vm; + schunk->map = smap; + schunk->map_alloc = ARRAY_SIZE(smap); if (dyn_size) schunk->free_size = dyn_size; else - schunk->free_size = pcpu_unit_size - pcpu_static_size; + schunk->free_size = pcpu_unit_size - static_size; schunk->contig_hint = schunk->free_size; + schunk->map[schunk->map_used++] = -static_size; + if (schunk->free_size) + schunk->map[schunk->map_used++] = schunk->free_size; + /* allocate vm address */ first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; @@ -948,7 +949,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, *pcpu_chunk_pagep(schunk, cpu, i) = page; } - BUG_ON(i < PFN_UP(pcpu_static_size)); + BUG_ON(i < PFN_UP(static_size)); if (nr_pages < 0) nr_pages = i; -- cgit v1.2.3 From cafe8816b217b98dc3f268d3b77445da498beb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: use negative for auto for pcpu_setup_first_chunk() arguments Impact: argument semantic cleanup In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant auto-sizing. It's okay for @unit_size as 0 doesn't make sense but 0 dynamic reserve size is valid. Alos, if arch @dyn_size is calculated from other parameters, it might end up passing in 0 @dyn_size and malfunction when the size is automatically adjusted. This patch makes both @unit_size and @dyn_size ssize_t and use -1 for auto sizing. 
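To illustrate the new calling convention, a caller that wants full auto-sizing now passes -1 explicitly (sketch based on the setup_pcpu_4k() hunk below):

/* auto unit size, auto dynamic size, no premapped base address */
ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
			     -1 /* unit_size: auto */,
			     -1 /* dyn_size: auto */,
			     NULL, pcpu4k_populate_pte);

A dyn_size of 0 now keeps its literal meaning, i.e. no dynamic area beyond the static area in the first chunk, instead of requesting auto-sizing.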
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 5 +++-- mm/percpu.c | 46 +++++++++++++++++++++++------------------- 3 files changed, 29 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c29f301d3885..ef3a2cd3fe64 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -344,7 +344,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, pcpu4k_populate_pte); goto out_free_ar; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a0b4ea2a3354..a96fc53bbd62 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -117,8 +117,9 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t dyn_size, void *base_addr, + size_t static_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); /* diff --git a/mm/percpu.c b/mm/percpu.c index 503ccad091af..a84cf9977faf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -824,8 +824,8 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @dyn_size: free size for dynamic allocation in bytes, 0 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -842,13 +842,14 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * - * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size. + * @unit_size, if non-negative, specifies unit size and must be + * aligned to PAGE_SIZE and equal to or larger than @static_size + + * @dyn_size. * - * @dyn_size determines the number of free bytes after the static - * area in the first chunk. If zero, whatever left is available. - * Specifying non-zero value make percpu leave the area after - * @static_size + @dyn_size alone. + * @dyn_size, if non-negative, limits the number of bytes available + * for dynamic allocation in the first chunk. Specifying non-negative + * value make percpu leave alone the area beyond @static_size + + * @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -863,8 +864,9 @@ EXPORT_SYMBOL_GPL(free_percpu); * percpu access. 
*/ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t dyn_size, void *base_addr, + size_t static_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; @@ -877,13 +879,17 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); - BUG_ON(!unit_size && dyn_size); - BUG_ON(unit_size && unit_size < static_size + dyn_size); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(base_addr && !unit_size); + if (unit_size >= 0) { + BUG_ON(unit_size < static_size + + (dyn_size >= 0 ? dyn_size : 0)); + BUG_ON(unit_size & ~PAGE_MASK); + } else { + BUG_ON(dyn_size >= 0); + BUG_ON(base_addr); + } BUG_ON(base_addr && populate_pte_fn); - if (unit_size) + if (unit_size >= 0) pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, @@ -894,6 +900,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + if (dyn_size < 0) + dyn_size = pcpu_unit_size - static_size; + /* * Allocate chunk slots. The additional last slot is for * empty chunks. @@ -909,12 +918,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - - if (dyn_size) - schunk->free_size = dyn_size; - else - schunk->free_size = pcpu_unit_size - static_size; - + schunk->free_size = dyn_size; schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; -- cgit v1.2.3 From 3e24aa58907c62bc79d1094e941a374568f62522 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: add an indirection ptr for chunk page map access Impact: allow sharing page map, no functional difference yet Make chunk->page access indirect by adding a pointer and renaming the actual array to page_ar. This will be used by future changes. 
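In sketch form, the indirection added here is one extra pointer, which every chunk currently points at its own embedded array (paraphrasing the hunk below):

struct pcpu_chunk {
	...
	struct page **page;	/* page map in use */
	struct page *page_ar[];	/* embedded array, #cpus * UNIT_PAGES */
};

/* set once when a chunk is created */
chunk->page = chunk->page_ar;

All existing chunk->page[...] dereferences stay untouched, which is what later allows two chunks to share a single page map.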
Signed-off-by: Tejun Heo --- mm/percpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index a84cf9977faf..5b47d9fe65f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,7 +80,8 @@ struct pcpu_chunk { int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ bool immutable; /* no [de]population allowed */ - struct page *page[]; /* #cpus * UNIT_PAGES */ + struct page **page; /* points to page array */ + struct page *page_ar[]; /* #cpus * UNIT_PAGES */ }; static int pcpu_unit_pages __read_mostly; @@ -696,6 +697,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->page = chunk->page_ar; chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); if (!chunk->vm) { @@ -918,6 +920,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); + schunk->page = schunk->page_ar; schunk->free_size = dyn_size; schunk->contig_hint = schunk->free_size; -- cgit v1.2.3 From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu, module: implement reserved allocation and use it for module percpu variables Impact: add reserved allocation functionality and use it for module percpu variables This patch implements reserved allocation from the first chunk. When setting up the first chunk, the arch can ask to set aside a certain number of bytes right after the core static area which is available only through a separate reserved allocator. This will be used primarily for module static percpu variables on architectures with limited relocation range to ensure that the module percpu symbols are inside the relocatable range. If a reserved area is requested, the first chunk becomes reserved and isn't available for regular allocation. If the first chunk also includes a piggy-back dynamic allocation area, a separate chunk mapping the same region is created to serve dynamic allocation. The first one is called the static first chunk and the second the dynamic first chunk. Although they share the page map, their different area map initializations guarantee they serve disjoint areas according to their purposes. If the arch doesn't set up a reserved area, reserved allocation is handled like any other allocation.
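On the module loader side the change is a one-liner; a sketch of the call it now makes (taken from the kernel/module.c hunk below):

/* module static percpu variables are carved out of the reserved area */
ptr = __alloc_reserved_percpu(size, align);
if (!ptr)
	printk(KERN_WARNING
	       "Could not allocate %lu bytes percpu data\n", size);

When an arch passes reserved_size == 0 to pcpu_setup_first_chunk(), no pcpu_reserved_chunk exists and __alloc_reserved_percpu() simply falls through to the regular dynamic chunks.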
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 8 +-- include/linux/percpu.h | 10 +-- kernel/module.c | 2 +- mm/percpu.c | 153 +++++++++++++++++++++++++++++++++++------ 4 files changed, 144 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 38e2b2a470a5..dd4eabc747c8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -217,7 +217,7 @@ proceed: pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, pcpur_size - static_size, vm.addr, NULL); goto out_free_ar; @@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, - pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, + NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a96fc53bbd62..8ff15153ae20 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -117,10 +117,10 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, - ssize_t unit_size, ssize_t dyn_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + size_t static_size, size_t reserved_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn); /* * Use this to get to a cpu's version of the per-cpu object @@ -129,6 +129,8 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, */ #define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) +extern void *__alloc_reserved_percpu(size_t size, size_t align); + #else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ struct percpu_data { diff --git a/kernel/module.c b/kernel/module.c index 1f0657ae555b..f0e04d6b67d8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, align = PAGE_SIZE; } - ptr = __alloc_percpu(size, align); + ptr = __alloc_reserved_percpu(size, align); if (!ptr) printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", size); diff --git a/mm/percpu.c b/mm/percpu.c index 5b47d9fe65f5..ef8e169b7731 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +/* optional reserved chunk, only accessible for reserved allocations */ +static struct pcpu_chunk *pcpu_reserved_chunk; +/* offset limit of the reserved chunk */ +static int pcpu_reserved_chunk_limit; + /* * One mutex to rule them all. 
* @@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size) * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is - * moved to the slot. + * moved to the slot. Note that the reserved chunk is never put on + * chunk slots. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); - if (oslot != nslot) { + if (chunk != pcpu_reserved_chunk && oslot != nslot) { if (oslot < nslot) list_move(&chunk->list, &pcpu_slot[nslot]); else @@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) struct rb_node *n, *parent; struct pcpu_chunk *chunk; + /* is it in the reserved chunk? */ + if (pcpu_reserved_chunk) { + void *start = pcpu_reserved_chunk->vm->addr; + + if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + return pcpu_reserved_chunk; + } + + /* nah... search the regular ones */ n = *pcpu_chunk_rb_search(addr, &parent); if (!n) { /* no exactly matching chunk, the parent is the closest */ @@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) } /** - * __alloc_percpu - allocate percpu area + * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @reserved: allocate from the reserved chunk if available * * Allocate percpu area of @size bytes aligned at @align. Might * sleep. Might trigger writeouts. @@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_percpu(size_t size, size_t align) +static void *pcpu_alloc(size_t size, size_t align, bool reserved) { void *ptr = NULL; struct pcpu_chunk *chunk; @@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align) mutex_lock(&pcpu_mutex); - /* allocate area */ + /* serve reserved allocations from the reserved chunk if available */ + if (reserved && pcpu_reserved_chunk) { + chunk = pcpu_reserved_chunk; + if (size > chunk->contig_hint) + goto out_unlock; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + goto out_unlock; + } + + /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) @@ -773,8 +800,41 @@ out_unlock: mutex_unlock(&pcpu_mutex); return ptr; } + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, false); +} EXPORT_SYMBOL_GPL(__alloc_percpu); +/** + * __alloc_reserved_percpu - allocate reserved percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align from reserved + * percpu area if arch has set it up; otherwise, allocation is served + * from the same dynamic area. Might sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. 
+ */ +void *__alloc_reserved_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, true); +} + static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { WARN_ON(chunk->immutable); @@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto @@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * + * @reserved_size, if non-zero, specifies the amount of bytes to + * reserve after the static area in the first chunk. This reserves + * the first chunk such that it's available only through reserved + * percpu allocation. This is primarily used to serve module percpu + * static areas on architectures where the addressing model has + * limited offset range for symbol relocations to guarantee module + * percpu symbols fall inside the relocatable range. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * @dyn_size, if non-negative, limits the number of bytes available * for dynamic allocation in the first chunk. Specifying non-negative * value make percpu leave alone the area beyond @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu); * @populate_pte_fn is used to populate the pagetable. NULL means the * caller already populated the pagetable. * + * If the first chunk ends up with both reserved and dynamic areas, it + * is served by two chunks - one to serve the core static and reserved + * areas and the other for the dynamic area. They share the same vm + * and page map but uses different area allocation map to stay away + * from each other. The latter chunk is circulated in the chunk slots + * and available for dynamic allocation like any other chunks. + * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, + size_t static_size, size_t reserved_size, ssize_t unit_size, ssize_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; - static int smap[2]; - struct pcpu_chunk *schunk; + static int smap[2], dmap[2]; + struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ - BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || + ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + + BUG_ON(unit_size < static_size + reserved_size + (dyn_size >= 0 ? 
dyn_size : 0)); BUG_ON(unit_size & ~PAGE_MASK); } else { @@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size)); + PFN_UP(static_size + reserved_size)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; @@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size; + dyn_size = pcpu_unit_size - static_size - reserved_size; /* * Allocate chunk slots. The additional last slot is for @@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static chunk */ + /* + * Initialize static chunk. If reserved_size is zero, the + * static chunk covers static area + dynamic allocation area + * in the first chunk. If reserved_size is not zero, it + * covers static area + reserved area (mostly used for module + * static percpu allocation). + */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->page = schunk->page_ar; - schunk->free_size = dyn_size; + + if (reserved_size) { + schunk->free_size = reserved_size; + pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + } else { + schunk->free_size = dyn_size; + dyn_size = 0; /* dynamic area covered */ + } schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; + pcpu_reserved_chunk_limit = static_size + schunk->free_size; + + /* init dynamic chunk if necessary */ + if (dyn_size) { + dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + INIT_LIST_HEAD(&dchunk->list); + dchunk->vm = &first_vm; + dchunk->map = dmap; + dchunk->map_alloc = ARRAY_SIZE(dmap); + dchunk->page = schunk->page_ar; /* share page map with schunk */ + + dchunk->contig_hint = dchunk->free_size = dyn_size; + dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; + dchunk->map[dchunk->map_used++] = dchunk->free_size; + } + /* allocate vm address */ first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; @@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, else { /* * Pages already mapped. No need to remap into - * vmalloc area. In this case the static chunk can't - * be mapped or unmapped by percpu and is marked + * vmalloc area. In this case the first chunks can't + * be mapped or unmapped by percpu and are marked * immutable. 
*/ first_vm.addr = base_addr; schunk->immutable = true; + if (dchunk) + dchunk->immutable = true; } /* assign pages */ @@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); + if (!dchunk) { + pcpu_chunk_relocate(schunk, -1); + pcpu_chunk_addr_insert(schunk); + } else { + pcpu_chunk_relocate(dchunk, -1); + pcpu_chunk_addr_insert(dchunk); + } /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); -- cgit v1.2.3 From 1880d93b80acc3171850e9df5048bcb26b75c2f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:09 +0900 Subject: percpu: replace pcpu_realloc() with pcpu_mem_alloc() and pcpu_mem_free() Impact: code reorganization for later changes With static map handling moved to pcpu_split_block(), pcpu_realloc() only clutters the code and it's also unsuitable for scheduled locking changes. Implement and use pcpu_mem_alloc/free() instead. Signed-off-by: Tejun Heo --- mm/percpu.c | 85 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index ef8e169b7731..f1d0e905850c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -164,39 +164,41 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, } /** - * pcpu_realloc - versatile realloc - * @p: the current pointer (can be NULL for new allocations) - * @size: the current size in bytes (can be 0 for new allocations) - * @new_size: the wanted new size in bytes (can be 0 for free) + * pcpu_mem_alloc - allocate memory + * @size: bytes to allocate * - * More robust realloc which can be used to allocate, resize or free a - * memory area of arbitrary size. If the needed size goes over - * PAGE_SIZE, kernel VM is used. + * Allocate @size bytes. If @size is smaller than PAGE_SIZE, + * kzalloc() is used; otherwise, vmalloc() is used. The returned + * memory is always zeroed. * * RETURNS: - * The new pointer on success, NULL on failure. + * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_realloc(void *p, size_t size, size_t new_size) +static void *pcpu_mem_alloc(size_t size) { - void *new; - - if (new_size <= PAGE_SIZE) - new = kmalloc(new_size, GFP_KERNEL); - else - new = vmalloc(new_size); - if (new_size && !new) - return NULL; - - memcpy(new, p, min(size, new_size)); - if (new_size > size) - memset(new + size, 0, new_size - size); + if (size <= PAGE_SIZE) + return kzalloc(size, GFP_KERNEL); + else { + void *ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; + } +} +/** + * pcpu_mem_free - free memory + * @ptr: memory to free + * @size: size of the area + * + * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). + */ +static void pcpu_mem_free(void *ptr, size_t size) +{ if (size <= PAGE_SIZE) - kfree(p); + kfree(ptr); else - vfree(p); - - return new; + vfree(ptr); } /** @@ -331,29 +333,27 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) if (chunk->map_alloc < target) { int new_alloc; int *new; + size_t size; new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < target) new_alloc *= 2; - if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) { - /* - * map_alloc smaller than the default size - * indicates that the chunk is one of the - * first chunks and still using static map. - * Allocate a dynamic one and copy. 
- */ - new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0])); - if (new) - memcpy(new, chunk->map, - chunk->map_alloc * sizeof(new[0])); - } else - new = pcpu_realloc(chunk->map, - chunk->map_alloc * sizeof(new[0]), - new_alloc * sizeof(new[0])); + new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); if (!new) return -ENOMEM; + size = chunk->map_alloc * sizeof(chunk->map[0]); + memcpy(new, chunk->map, size); + + /* + * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the + * chunk is one of the first chunks and still using + * static map. + */ + if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) + pcpu_mem_free(chunk->map, size); + chunk->map_alloc = new_alloc; chunk->map = new; } @@ -696,7 +696,7 @@ static void free_pcpu_chunk(struct pcpu_chunk *chunk) return; if (chunk->vm) free_vm_area(chunk->vm); - pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } @@ -708,8 +708,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) if (!chunk) return NULL; - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; -- cgit v1.2.3 From 9f7dcf224bd09ec9ebcbfb383bf2c465e0e0b03d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:09 +0900 Subject: percpu: move chunk area map extension out of area allocation Impact: code reorganization for later changes Separate out chunk area map extension into a separate function - pcpu_extend_area_map() - and call it directly from pcpu_alloc() such that pcpu_alloc_area() is guaranteed to have enough area map slots on invocation. With this change, pcpu_alloc_area() does only area allocation and the only failure mode is when the chunk doens't have enough room, so there's no need to distinguish it from memory allocation failures. Make it return -1 on such cases instead of hacky -ENOSPC. Signed-off-by: Tejun Heo --- mm/percpu.c | 108 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 60 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index f1d0e905850c..7d9bc35e8ed2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -306,6 +306,50 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) rb_insert_color(&new->rb_node, &pcpu_addr_root); } +/** + * pcpu_extend_area_map - extend area map for allocation + * @chunk: target chunk + * + * Extend area map of @chunk so that it can accomodate an allocation. + * A single allocation can split an area into three areas, so this + * function makes sure that @chunk->map has at least two extra slots. + * + * RETURNS: + * 0 if noop, 1 if successfully extended, -errno on failure. + */ +static int pcpu_extend_area_map(struct pcpu_chunk *chunk) +{ + int new_alloc; + int *new; + size_t size; + + /* has enough? */ + if (chunk->map_alloc >= chunk->map_used + 2) + return 0; + + new_alloc = PCPU_DFL_MAP_ALLOC; + while (new_alloc < chunk->map_used + 2) + new_alloc *= 2; + + new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + size = chunk->map_alloc * sizeof(chunk->map[0]); + memcpy(new, chunk->map, size); + + /* + * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is + * one of the first chunks and still using static map. 
+ */ + if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) + pcpu_mem_free(chunk->map, size); + + chunk->map_alloc = new_alloc; + chunk->map = new; + return 0; +} + /** * pcpu_split_block - split a map block * @chunk: chunk of interest @@ -321,44 +365,16 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * depending on @head, is reduced by @tail bytes and @tail byte block * is inserted after the target block. * - * RETURNS: - * 0 on success, -errno on failure. + * @chunk->map must have enough free slots to accomodate the split. */ -static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +static void pcpu_split_block(struct pcpu_chunk *chunk, int i, + int head, int tail) { int nr_extra = !!head + !!tail; - int target = chunk->map_used + nr_extra; - - /* reallocation required? */ - if (chunk->map_alloc < target) { - int new_alloc; - int *new; - size_t size; - - new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < target) - new_alloc *= 2; - - new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); - if (!new) - return -ENOMEM; - - size = chunk->map_alloc * sizeof(chunk->map[0]); - memcpy(new, chunk->map, size); - - /* - * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the - * chunk is one of the first chunks and still using - * static map. - */ - if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) - pcpu_mem_free(chunk->map, size); - chunk->map_alloc = new_alloc; - chunk->map = new; - } + BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); - /* insert a new subblock */ + /* insert new subblocks */ memmove(&chunk->map[i + nr_extra], &chunk->map[i], sizeof(chunk->map[0]) * (chunk->map_used - i)); chunk->map_used += nr_extra; @@ -371,7 +387,6 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) chunk->map[i++] -= tail; chunk->map[i] = tail; } - return 0; } /** @@ -384,8 +399,11 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) * Note that this function only allocates the offset. It doesn't * populate or map the area. * + * @chunk->map must have at least two free slots. + * * RETURNS: - * Allocated offset in @chunk on success, -errno on failure. + * Allocated offset in @chunk on success, -1 if no matching area is + * found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) { @@ -433,8 +451,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) /* split if warranted */ if (head || tail) { - if (pcpu_split_block(chunk, i, head, tail)) - return -ENOMEM; + pcpu_split_block(chunk, i, head, tail); if (head) { i++; off += head; @@ -461,14 +478,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) chunk->contig_hint = max_contig; /* fully scanned */ pcpu_chunk_relocate(chunk, oslot); - /* - * Tell the upper layer that this chunk has no area left. - * Note that this is not an error condition but a notification - * to upper layer that it needs to look at other chunks. - * -ENOSPC is chosen as it isn't used in memory subsystem and - * matches the meaning in a way. 
- */ - return -ENOSPC; + /* tell the upper layer that this chunk has no matching area */ + return -1; } /** @@ -755,7 +766,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint) + if (size > chunk->contig_hint || + pcpu_extend_area_map(chunk) < 0) goto out_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) @@ -768,11 +780,11 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) continue; + if (pcpu_extend_area_map(chunk) < 0) + goto out_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; - if (off != -ENOSPC) - goto out_unlock; } } -- cgit v1.2.3 From a56dbddf06b653ef9c04ca3767f260fd31ccebab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:11 +0900 Subject: percpu: move fully free chunk reclamation into a work Impact: code reorganization for later changes Do fully free chunk reclamation using a work. This change is to prepare for locking changes. Signed-off-by: Tejun Heo --- mm/percpu.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 7d9bc35e8ed2..4c8a419119da 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -118,6 +119,10 @@ static DEFINE_MUTEX(pcpu_mutex); static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ +/* reclaim work to release fully free chunks, scheduled from free path */ +static void pcpu_reclaim(struct work_struct *work); +static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); + static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ @@ -846,13 +851,37 @@ void *__alloc_reserved_percpu(size_t size, size_t align) return pcpu_alloc(size, align, true); } -static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +/** + * pcpu_reclaim - reclaim fully free chunks, workqueue function + * @work: unused + * + * Reclaim all fully free chunks except for the first one. 
+ */ +static void pcpu_reclaim(struct work_struct *work) { - WARN_ON(chunk->immutable); - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); - list_del(&chunk->list); - rb_erase(&chunk->rb_node, &pcpu_addr_root); - free_pcpu_chunk(chunk); + LIST_HEAD(todo); + struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + struct pcpu_chunk *chunk, *next; + + mutex_lock(&pcpu_mutex); + + list_for_each_entry_safe(chunk, next, head, list) { + WARN_ON(chunk->immutable); + + /* spare the first one */ + if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + continue; + + rb_erase(&chunk->rb_node, &pcpu_addr_root); + list_move(&chunk->list, &todo); + } + + mutex_unlock(&pcpu_mutex); + + list_for_each_entry_safe(chunk, next, &todo, list) { + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + free_pcpu_chunk(chunk); + } } /** @@ -877,14 +906,13 @@ void free_percpu(void *ptr) pcpu_free_area(chunk, off); - /* the chunk became fully free, kill one if there are other free ones */ + /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { struct pcpu_chunk *pos; - list_for_each_entry(pos, - &pcpu_slot[pcpu_chunk_slot(chunk)], list) + list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - pcpu_kill_chunk(pos); + schedule_work(&pcpu_reclaim_work); break; } } -- cgit v1.2.3 From ccea34b5d0fbab081496d1860f31acee99fa8a6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:13 +0900 Subject: percpu: finer grained locking to break deadlock and allow atomic free Impact: fix deadlock and allow atomic free Percpu allocation always uses GFP_KERNEL and whole alloc/free paths were protected by single mutex. All percpu allocations have been from GFP_KERNEL-safe context and the original allocator had this assumption too. However, by protecting both alloc and free paths with the same mutex, the new allocator creates free -> alloc -> GFP_KERNEL dependency which the original allocator didn't have. This can lead to deadlock if free is called from FS or IO paths. Also, in general, allocators are expected to allow free to be called from atomic context. This patch implements finer grained locking to break the deadlock and allow atomic free. For details, please read the "Synchronization rules" comment. While at it, also add CONTEXT: to function comments to describe which context they expect to be called from and what they do to it. This problem was reported by Thomas Gleixner and Peter Zijlstra. http://thread.gmane.org/gmane.linux.kernel/802384 Signed-off-by: Tejun Heo Reported-by: Thomas Gleixner Reported-by: Peter Zijlstra --- mm/percpu.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 124 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 4c8a419119da..bfe6a3afaf45 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; /* - * One mutex to rule them all. - * - * The following mutex is grabbed in the outermost public alloc/free - * interface functions and released only when the operation is - * complete. As such, every function in this file other than the - * outermost functions are called under pcpu_mutex. 
- * - * It can easily be switched to use spinlock such that only the area - * allocation and page population commit are protected with it doing - * actual [de]allocation without holding any lock. However, given - * what this allocator does, I think it's better to let them run - * sequentially. + * Synchronization rules. + * + * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former + * protects allocation/reclaim paths, chunks and chunk->page arrays. + * The latter is a spinlock and protects the index data structures - + * chunk slots, rbtree, chunks and area maps in chunks. + * + * During allocation, pcpu_alloc_mutex is kept locked all the time and + * pcpu_lock is grabbed and released as necessary. All actual memory + * allocations are done using GFP_KERNEL with pcpu_lock released. + * + * Free path accesses and alters only the index data structures, so it + * can be safely called from atomic context. When memory needs to be + * returned to the system, free path schedules reclaim_work which + * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be + * reclaimed, release both locks and frees the chunks. Note that it's + * necessary to grab both locks to remove a chunk from circulation as + * allocation path might be referencing the chunk with only + * pcpu_alloc_mutex locked. */ -static DEFINE_MUTEX(pcpu_mutex); +static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ +static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ @@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, * kzalloc() is used; otherwise, vmalloc() is used. The returned * memory is always zeroed. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ @@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size) * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { @@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr, * searchs for the chunk with the highest start address which isn't * beyond @addr. * + * CONTEXT: + * pcpu_lock. + * * RETURNS: * The address of the found chunk. */ @@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @new: chunk to insert * * Insert @new into address rb tree. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) { @@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * A single allocation can split an area into three areas, so this * function makes sure that @chunk->map has at least two extra slots. * + * CONTEXT: + * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired + * if area map is extended. + * * RETURNS: * 0 if noop, 1 if successfully extended, -errno on failure. 
*/ @@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) if (chunk->map_alloc >= chunk->map_used + 2) return 0; + spin_unlock_irq(&pcpu_lock); + new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < chunk->map_used + 2) new_alloc *= 2; new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); - if (!new) + if (!new) { + spin_lock_irq(&pcpu_lock); return -ENOMEM; + } + + /* + * Acquire pcpu_lock and switch to new area map. Only free + * could have happened inbetween, so map_used couldn't have + * grown. + */ + spin_lock_irq(&pcpu_lock); + BUG_ON(new_alloc < chunk->map_used + 2); size = chunk->map_alloc * sizeof(chunk->map[0]); memcpy(new, chunk->map, size); @@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) * is inserted after the target block. * * @chunk->map must have enough free slots to accomodate the split. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) @@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i, * * @chunk->map must have at least two free slots. * + * CONTEXT: + * pcpu_lock. + * * RETURNS: * Allocated offset in @chunk on success, -1 if no matching area is * found. @@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap * the area. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { @@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. */ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, bool flush) @@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. The area is cleared on return. + * + * CONTEXT: + * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { @@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. + * Allocate percpu area of @size bytes aligned at @align. + * + * CONTEXT: + * Does GFP_KERNEL allocation. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { - void *ptr = NULL; struct pcpu_chunk *chunk; int slot, off; @@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_mutex); + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || pcpu_extend_area_map(chunk) < 0) - goto out_unlock; + goto fail_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; - goto out_unlock; + goto fail_unlock; } +restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) continue; - if (pcpu_extend_area_map(chunk) < 0) - goto out_unlock; + + switch (pcpu_extend_area_map(chunk)) { + case 0: + break; + case 1: + goto restart; /* pcpu_lock dropped, restart */ + default: + goto fail_unlock; + } + off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; @@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) } /* hmmm... no space left, create a new chunk */ + spin_unlock_irq(&pcpu_lock); + chunk = alloc_pcpu_chunk(); if (!chunk) - goto out_unlock; + goto fail_unlock_mutex; + + spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); pcpu_chunk_addr_insert(chunk); - - off = pcpu_alloc_area(chunk, size, align); - if (off < 0) - goto out_unlock; + goto restart; area_found: + spin_unlock_irq(&pcpu_lock); + /* populate, map and clear the area */ if (pcpu_populate_chunk(chunk, off, size)) { + spin_lock_irq(&pcpu_lock); pcpu_free_area(chunk, off); - goto out_unlock; + goto fail_unlock; } - ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); -out_unlock: - mutex_unlock(&pcpu_mutex); - return ptr; + mutex_unlock(&pcpu_alloc_mutex); + + return __addr_to_pcpu_ptr(chunk->vm->addr + off); + +fail_unlock: + spin_unlock_irq(&pcpu_lock); +fail_unlock_mutex: + mutex_unlock(&pcpu_alloc_mutex); + return NULL; } /** @@ -825,6 +897,9 @@ out_unlock: * Allocate percpu area of @size bytes aligned at @align. Might * sleep. Might trigger writeouts. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ @@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * percpu area if arch has set it up; otherwise, allocation is served * from the same dynamic area. Might sleep. Might trigger writeouts. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ @@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align) * @work: unused * * Reclaim all fully free chunks except for the first one. + * + * CONTEXT: + * workqueue context. 
*/ static void pcpu_reclaim(struct work_struct *work) { @@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work) struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; - mutex_lock(&pcpu_mutex); + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, head, list) { WARN_ON(chunk->immutable); @@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work) list_move(&chunk->list, &todo); } - mutex_unlock(&pcpu_mutex); + spin_unlock_irq(&pcpu_lock); + mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); @@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work) * free_percpu - free percpu area * @ptr: pointer to area to free * - * Free percpu area @ptr. Might sleep. + * Free percpu area @ptr. + * + * CONTEXT: + * Can be called from atomic context. */ void free_percpu(void *ptr) { void *addr = __pcpu_ptr_to_addr(ptr); struct pcpu_chunk *chunk; + unsigned long flags; int off; if (!ptr) return; - mutex_lock(&pcpu_mutex); + spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->vm->addr; @@ -917,7 +1004,7 @@ void free_percpu(void *ptr) } } - mutex_unlock(&pcpu_mutex); + spin_unlock_irqrestore(&pcpu_lock, flags); } EXPORT_SYMBOL_GPL(free_percpu); -- cgit v1.2.3 From e01009833e22dc87075d770554b34d797843ed23 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: make x86 addr <-> pcpu ptr conversion macros generic Impact: generic addr <-> pcpu ptr conversion macros There's nothing arch specific about x86 __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr(). With proper __per_cpu_load and __per_cpu_start defined, they'll do the right thing regardless of actual layout. Move these macros from arch/x86/include/asm/percpu.h to mm/percpu.c and allow archs to override it as necessary. 
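A minimal sketch of the override hook this adds (illustrative only, not taken from the patch): an arch whose percpu pointers happen to coincide with their linear addresses could supply its own translation in asm/percpu.h, and the #ifndef defaults in mm/percpu.c would then be skipped.

	/* hypothetical asm/percpu.h override: identity translation */
	#define __addr_to_pcpu_ptr(addr)	((void *)(addr))
	#define __pcpu_ptr_to_addr(ptr)		((void *)(ptr))
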
Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 8 -------- mm/percpu.c | 16 +++++++++++++++- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 8f1d2fbec1d4..aee103b26d01 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -43,14 +43,6 @@ #else /* ...!ASSEMBLY */ #include -#include - -#define __addr_to_pcpu_ptr(addr) \ - (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ - + (unsigned long)__per_cpu_start) -#define __pcpu_ptr_to_addr(ptr) \ - (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ - - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/mm/percpu.c b/mm/percpu.c index bfe6a3afaf45..c6f38a2aface 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -46,7 +46,8 @@ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate - * regular address to percpu pointer and back + * regular address to percpu pointer and back if they need to be + * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area @@ -67,11 +68,24 @@ #include #include +#include #include #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ +#ifndef __addr_to_pcpu_ptr +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#endif +#ifndef __pcpu_ptr_to_addr +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) +#endif + struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ struct rb_node rb_node; /* key is chunk->vm->addr */ -- cgit v1.2.3 From 6074d5b0a319fe8400ff079a3c289406ca024321 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: more flexibility for @dyn_size of pcpu_setup_first_chunk() Impact: cleanup, more flexibility for first chunk init Non-negative @dyn_size used to be allowed iff @unit_size wasn't auto. This restriction stemmed from implementation detail and made things a bit less intuitive. This patch allows @dyn_size to be specified regardless of @unit_size and swaps the positions of @dyn_size and @unit_size so that the parameter order makes more sense (static, reserved and dyn sizes followed by enclosing unit_size). While at it, add @unit_size >= PCPU_MIN_UNIT_SIZE sanity check. 
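For illustration only (my_get_page and my_populate_pte are hypothetical arch callbacks, not part of this patch), a 4k-style caller written against the reordered prototype would now read:

	static ssize_t __init my_setup_pcpu(size_t static_size)
	{
		return pcpu_setup_first_chunk(my_get_page, static_size,
					      PERCPU_FIRST_CHUNK_RESERVE,
					      -1,	/* dyn_size, now 4th: auto */
					      -1,	/* unit_size, now 5th: auto */
					      NULL, my_populate_pte);
	}
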
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 13 ++++++------- include/linux/percpu.h | 2 +- mm/percpu.c | 28 ++++++++++++++-------------- 3 files changed, 21 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index efa615f2bf43..e41c51f6ada1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -233,8 +233,8 @@ proceed: "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, dyn_size, vm.addr, NULL); + PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + PMD_SIZE, vm.addr, NULL); goto out_free_ar; enomem: @@ -315,9 +315,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); return pcpu_setup_first_chunk(pcpue_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, - pcpue_unit_size, dyn_size, - pcpue_ptr, NULL); + PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + pcpue_unit_size, pcpue_ptr, NULL); } /* @@ -375,8 +374,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pcpu4k_nr_static_pages, static_size); ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, - pcpu4k_populate_pte); + PERCPU_FIRST_CHUNK_RESERVE, -1, + -1, NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 54a968b4b924..fb455dcc59c7 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -107,7 +107,7 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t unit_size, ssize_t dyn_size, + ssize_t dyn_size, ssize_t unit_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); diff --git a/mm/percpu.c b/mm/percpu.c index c6f38a2aface..2f94661d3e36 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1027,8 +1027,8 @@ EXPORT_SYMBOL_GPL(free_percpu); * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -1053,14 +1053,14 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * + * @dyn_size, if non-negative, determines the number of bytes + * available for dynamic allocation in the first chunk. Specifying + * non-negative value makes percpu leave alone the area beyond + * @static_size + @reserved_size + @dyn_size. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + @dyn_size. - * - * @dyn_size, if non-negative, limits the number of bytes available - * for dynamic allocation in the first chunk. Specifying non-negative - * value make percpu leave alone the area beyond @static_size + - * @reserved_size + @dyn_size. + * @reserved_size + if non-negative, @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. 
percpu must not mess @@ -1083,12 +1083,14 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t unit_size, ssize_t dyn_size, + ssize_t dyn_size, ssize_t unit_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; + size_t size_sum = static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; @@ -1099,20 +1101,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); + BUG_ON(unit_size < size_sum); BUG_ON(unit_size & ~PAGE_MASK); - } else { - BUG_ON(dyn_size >= 0); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + } else BUG_ON(base_addr); - } BUG_ON(base_addr && populate_pte_fn); if (unit_size >= 0) pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size + reserved_size)); + PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; -- cgit v1.2.3 From 66c3a75772247c31feabefb724e082220a1ab060 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: generalize embedding first chunk setup helper Impact: code reorganization Separate out embedding first chunk setup helper from x86 embedding first chunk allocator and put it in mm/percpu.c. This will be used by the default percpu first chunk allocator and possibly by other archs. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 54 +++----------------------- include/linux/percpu.h | 4 ++ mm/percpu.c | 86 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e41c51f6ada1..400331b50a53 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -257,31 +257,13 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Embedding allocator * * The first chunk is sized to just contain the static area plus - * module and dynamic reserves, and allocated as a contiguous area - * using bootmem allocator and used as-is without being mapped into - * vmalloc area. This enables the first chunk to piggy back on the - * linear physical PMD mapping and doesn't add any additional pressure - * to TLB. Note that if the needed size is smaller than the minimum - * unit size, the leftover is returned to the bootmem allocator. + * module and dynamic reserves and embedded into linear physical + * mapping so that it can use PMD mapping without additional TLB + * pressure. 
*/ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpue_size) - return NULL; - - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); -} - static ssize_t __init setup_pcpu_embed(size_t static_size) { - unsigned int cpu; - size_t dyn_size; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; /* * If large page isn't supported, there's no benefit in doing @@ -291,32 +273,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) if (!cpu_has_pse || pcpu_need_numa()) return -EINVAL; - /* allocate and copy */ - pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE); - if (!pcpue_ptr) - return -ENOMEM; - - for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); - } - - /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); + return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index fb455dcc59c7..ee5615d65211 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -111,6 +111,10 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); +extern ssize_t __init pcpu_embed_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index 2f94661d3e36..1aa5d8fbca12 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1238,3 +1238,89 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); return pcpu_unit_size; } + +/* + * Embedding first chunk setup helper. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); +} + +/** + * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. 
+ * + * If this function is used to setup the first chunk, it is allocated + * as a contiguous area using bootmem allocator and used as-is without + * being mapped into vmalloc area. This enables the first chunk to + * piggy back on the linear physical mapping which often uses larger + * page size. + * + * When @dyn_size is positive, dynamic area might be larger than + * specified to fill page alignment. Also, when @dyn_size is auto, + * @dyn_size does not fill the whole first chunk but only what's + * necessary for page alignment after static and reserved areas. + * + * If the needed size is smaller than the minimum or specified unit + * size, the leftover is returned to the bootmem allocator. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size) +{ + unsigned int cpu; + + /* determine parameters and allocate */ + pcpue_size = PFN_ALIGN(static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0)); + if (dyn_size != 0) + dyn_size = pcpue_size - static_size - reserved_size; + + if (unit_size >= 0) { + BUG_ON(unit_size < pcpue_size); + pcpue_unit_size = unit_size; + } else + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + + pcpue_ptr = __alloc_bootmem_nopanic( + num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!pcpue_ptr) + return -ENOMEM; + + /* return the leftover and copy */ + for_each_possible_cpu(cpu) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + reserved_size, dyn_size, + pcpue_unit_size, pcpue_ptr, NULL); +} -- cgit v1.2.3 From 60db56422043aaa455ac7f858ce23c273220f9d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Mar 2009 14:36:54 +0900 Subject: percpu: fix spurious alignment WARN in legacy SMP percpu allocator Impact: remove spurious WARN on legacy SMP percpu allocator Commit f2a8205c4ef1af917d175c36a4097ae5587791c8 incorrectly added too tight WARN_ON_ONCE() on alignments for UP and legacy SMP percpu allocator. Commit e317603694bfd17b28a40de9d65e1a4ec12f816e fixed it for UP but legacy SMP allocator was forgotten. Fix it. Signed-off-by: Tejun Heo Reported-by: Sachin P. Sant --- mm/allocpercpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 3653c570232b..1882923bc706 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -120,7 +120,7 @@ void *__alloc_percpu(size_t size, size_t align) * on it. Larger alignment should only be used for module * percpu sections on SMP for which this path isn't used. */ - WARN_ON_ONCE(align > __alignof__(unsigned long long)); + WARN_ON_ONCE(align > SMP_CACHE_BYTES); if (unlikely(!pdata)) return NULL; -- cgit v1.2.3 From 4bb9c5c02153dfc89a6c73a6f32091413805ad7d Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 12 Mar 2009 17:45:27 -0700 Subject: VM, x86, PAT: Change is_linear_pfn_mapping to not use vm_pgoff Impact: fix false positive PAT warnings - also fix VirtalBox hang Use of vma->vm_pgoff to identify the pfnmaps that are fully mapped at mmap time is broken. 
vm_pgoff is set by generic mmap code even for cases where drivers are setting up the mappings at the fault time. The problem was originally reported here: http://marc.info/?l=linux-kernel&m=123383810628583&w=2 Change is_linear_pfn_mapping logic to overload VM_INSERTPAGE flag along with VM_PFNMAP to mean full PFNMAP setup at mmap time. Problem also tracked at: http://bugzilla.kernel.org/show_bug.cgi?id=12800 Reported-by: Thomas Hellstrom Tested-by: Frans Pop Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha @intel.com> Cc: Nick Piggin Cc: "ebiederm@xmission.com" Cc: # only for 2.6.29.1, not .28 LKML-Reference: <20090313004527.GA7176@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 5 +++-- include/linux/mm.h | 15 +++++++++++++-- mm/memory.c | 6 ++++-- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e0ab173b6974..21bc1f787ae2 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -641,10 +641,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. + * reserve_pfn_range() doesn't support RAM pages. Maintain the current + * behavior with RAM pages by returning success. */ if (is_ram != 0) - return -EINVAL; + return 0; ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) diff --git a/include/linux/mm.h b/include/linux/mm.h index 065cdf8c09fb..3daa05feed9f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,7 +98,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ -#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ +#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it. Refer note in VM_PFNMAP_AT_MMAP below */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ @@ -126,6 +126,17 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +/* + * pfnmap vmas that are fully mapped at mmap time (not mapped on fault). + * Used by x86 PAT to identify such PFNMAP mappings and optimize their handling. + * Note VM_INSERTPAGE flag is overloaded here. i.e, + * VM_INSERTPAGE && !VM_PFNMAP implies + * The vma has had "vm_insert_page()" done on it + * VM_INSERTPAGE && VM_PFNMAP implies + * The vma is PFNMAP with full mapping at mmap time + */ +#define VM_PFNMAP_AT_MMAP (VM_INSERTPAGE | VM_PFNMAP) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. @@ -145,7 +156,7 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); + return ((vma->vm_flags & VM_PFNMAP_AT_MMAP) == VM_PFNMAP_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index baa999e87cd2..d7df5babcba9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1665,9 +1665,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * behaviour that some programs depend on. 
We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". */ - if (addr == vma->vm_start && end == vma->vm_end) + if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; - else if (is_cow_mapping(vma->vm_flags)) + vma->vm_flags |= VM_PFNMAP_AT_MMAP; + } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; @@ -1679,6 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * needed from higher level routine calling unmap_vmas */ vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); + vma->vm_flags &= ~VM_PFNMAP_AT_MMAP; return -EINVAL; } -- cgit v1.2.3 From 895791dac6946d535991edd11341046f8e85ea77 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Fri, 13 Mar 2009 16:35:44 -0700 Subject: VM, x86, PAT: add a new vm flag to track full pfnmap at mmap Impact: cleanup Add a new vm flag VM_PFN_AT_MMAP to identify a PFNMAP that is fully mapped with remap_pfn_range. Patch removes the overloading of VM_INSERTPAGE from the earlier patch. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Acked-by: Nick Piggin LKML-Reference: <20090313233543.GA19909@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- include/linux/mm.h | 16 +++------------- mm/memory.c | 4 ++-- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3daa05feed9f..b1ea37fc7a24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,12 +98,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ -#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it. Refer note in VM_PFNMAP_AT_MMAP below */ +#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ +#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -126,17 +127,6 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) -/* - * pfnmap vmas that are fully mapped at mmap time (not mapped on fault). - * Used by x86 PAT to identify such PFNMAP mappings and optimize their handling. - * Note VM_INSERTPAGE flag is overloaded here. i.e, - * VM_INSERTPAGE && !VM_PFNMAP implies - * The vma has had "vm_insert_page()" done on it - * VM_INSERTPAGE && VM_PFNMAP implies - * The vma is PFNMAP with full mapping at mmap time - */ -#define VM_PFNMAP_AT_MMAP (VM_INSERTPAGE | VM_PFNMAP) - /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. 
@@ -156,7 +146,7 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return ((vma->vm_flags & VM_PFNMAP_AT_MMAP) == VM_PFNMAP_AT_MMAP); + return (vma->vm_flags & VM_PFN_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index d7df5babcba9..2032ad2fc34b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1667,7 +1667,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, */ if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFNMAP_AT_MMAP; + vma->vm_flags |= VM_PFN_AT_MMAP; } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; @@ -1680,7 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * needed from higher level routine calling unmap_vmas */ vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFNMAP_AT_MMAP; + vma->vm_flags &= ~VM_PFN_AT_MMAP; return -EINVAL; } -- cgit v1.2.3 From 1d885526f2f3fffacee2ecb541270bd00168adff Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Fri, 13 Mar 2009 13:52:00 -0700 Subject: vmscan: pgmoved should be cleared after updating recent_rotated pgmoved should be cleared after updating recent_rotated. Signed-off-by: Daisuke Nishimura Cc: Rik van Riel Cc: Lee Schermerhorn Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index e89517141657..56ddf41149eb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1262,7 +1262,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * Move the pages to the [file or anon] inactive list. */ pagevec_init(&pvec, 1); - pgmoved = 0; lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); @@ -1274,6 +1273,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[!!file] += pgmoved; + pgmoved = 0; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); -- cgit v1.2.3 From 3297e760776af18a26bf30046cbaaae2e730c5c2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 4 Mar 2009 22:49:41 -0500 Subject: highmem: atomic highmem kmap page pinning Most ARM machines have a non IO coherent cache, meaning that the dma_map_*() set of functions must clean and/or invalidate the affected memory manually before DMA occurs. And because the majority of those machines have a VIVT cache, the cache maintenance operations must be performed using virtual addresses. When a highmem page is kunmap'd, its mapping (and cache) remains in place in case it is kmap'd again. However if dma_map_page() is then called with such a page, some cache maintenance on the remaining mapping must be performed. In that case, page_address(page) is non null and we can use that to synchronize the cache. It is unlikely but still possible for kmap() to race and recycle the virtual address obtained above, and use it for another page before some on-going cache invalidation loop in dma_map_page() is done. In that case, the new mapping could end up with dirty cache lines for another page, and the unsuspecting cache invalidation loop in dma_map_page() might simply discard those dirty cache lines resulting in data loss. For example, let's consider this sequence of events: - dma_map_page(..., DMA_FROM_DEVICE) is called on a highmem page. 
--> - vaddr = page_address(page) is non null. In this case it is likely that the page has valid cache lines associated with vaddr. Remember that the cache is VIVT. --> for (i = vaddr; i < vaddr + PAGE_SIZE; i += 32) invalidate_cache_line(i); *** preemption occurs in the middle of the loop above *** - kmap_high() is called for a different page. --> - last_pkmap_nr wraps to zero and flush_all_zero_pkmaps() is called. The pkmap_count value for the page passed to dma_map_page() above happens to be 1, so the page is unmapped. But prior to that, flush_cache_kmaps() cleared the cache for it. So far so good. - A fresh pkmap entry is assigned for this kmap request. The Murphy law says this pkmap entry will eventually happen to use the same vaddr as the one which used to belong to the other page being processed by dma_map_page() in the preempted thread above. - The kmap_high() caller start dirtying the cache using the just assigned virtual mapping for its page. *** the first thread is rescheduled *** - The for(...) loop is resumed, but now cached data belonging to a different physical page is being discarded ! And this is not only a preemption issue as ARM can be SMP as well, making the above scenario just as likely. Hence the need for some kind of pkmap page pinning which can be used in any context, primarily for the benefit of dma_map_page() on ARM. This provides the necessary interface to cope with the above issue if ARCH_NEEDS_KMAP_HIGH_GET is defined, otherwise the resulting code is unchanged. Signed-off-by: Nicolas Pitre Reviewed-by: MinChan Kim Acked-by: Andrew Morton --- mm/highmem.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index b36b83b920ff..910198037bf5 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -67,6 +67,25 @@ pte_t * pkmap_page_table; static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); +/* + * Most architectures have no use for kmap_high_get(), so let's abstract + * the disabling of IRQ out of the locking in that case to save on a + * potential useless overhead. + */ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +#define lock_kmap() spin_lock_irq(&kmap_lock) +#define unlock_kmap() spin_unlock_irq(&kmap_lock) +#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags) +#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags) +#else +#define lock_kmap() spin_lock(&kmap_lock) +#define unlock_kmap() spin_unlock(&kmap_lock) +#define lock_kmap_any(flags) \ + do { spin_lock(&kmap_lock); (void)(flags); } while (0) +#define unlock_kmap_any(flags) \ + do { spin_unlock(&kmap_lock); (void)(flags); } while (0) +#endif + static void flush_all_zero_pkmaps(void) { int i; @@ -113,9 +132,9 @@ static void flush_all_zero_pkmaps(void) */ void kmap_flush_unused(void) { - spin_lock(&kmap_lock); + lock_kmap(); flush_all_zero_pkmaps(); - spin_unlock(&kmap_lock); + unlock_kmap(); } static inline unsigned long map_new_virtual(struct page *page) @@ -145,10 +164,10 @@ start: __set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&pkmap_map_wait, &wait); - spin_unlock(&kmap_lock); + unlock_kmap(); schedule(); remove_wait_queue(&pkmap_map_wait, &wait); - spin_lock(&kmap_lock); + lock_kmap(); /* Somebody else might have mapped it while we slept */ if (page_address(page)) @@ -184,29 +203,59 @@ void *kmap_high(struct page *page) * For highmem pages, we can't trust "virtual" until * after we have the lock. 
*/ - spin_lock(&kmap_lock); + lock_kmap(); vaddr = (unsigned long)page_address(page); if (!vaddr) vaddr = map_new_virtual(page); pkmap_count[PKMAP_NR(vaddr)]++; BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); - spin_unlock(&kmap_lock); + unlock_kmap(); return (void*) vaddr; } EXPORT_SYMBOL(kmap_high); +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +/** + * kmap_high_get - pin a highmem page into memory + * @page: &struct page to pin + * + * Returns the page's current virtual memory address, or NULL if no mapping + * exists. When and only when a non null address is returned then a + * matching call to kunmap_high() is necessary. + * + * This can be called from any context. + */ +void *kmap_high_get(struct page *page) +{ + unsigned long vaddr, flags; + + lock_kmap_any(flags); + vaddr = (unsigned long)page_address(page); + if (vaddr) { + BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1); + pkmap_count[PKMAP_NR(vaddr)]++; + } + unlock_kmap_any(flags); + return (void*) vaddr; +} +#endif + /** * kunmap_high - map a highmem page into memory * @page: &struct page to unmap + * + * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called + * only from user context. */ void kunmap_high(struct page *page) { unsigned long vaddr; unsigned long nr; + unsigned long flags; int need_wakeup; - spin_lock(&kmap_lock); + lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); BUG_ON(!vaddr); nr = PKMAP_NR(vaddr); @@ -232,7 +281,7 @@ void kunmap_high(struct page *page) */ need_wakeup = waitqueue_active(&pkmap_map_wait); } - spin_unlock(&kmap_lock); + unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) -- cgit v1.2.3 From 1a00df4a2cc001dd9f45890e690548c24b2fa2d9 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 7 Mar 2009 00:36:21 +0900 Subject: slub: use get_track() Use get_track() in set_track() Signed-off-by: Akinobu Mita Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Pekka Enberg --- mm/slub.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f21e25ad453b..e150b5c0424f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -374,14 +374,8 @@ static struct track *get_track(struct kmem_cache *s, void *object, static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { - struct track *p; - - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + struct track *p = get_track(s, object, alloc); - p += alloc; if (addr) { p->addr = addr; p->cpu = smp_processor_id(); -- cgit v1.2.3 From 6fb8f424393025674fde7869b59f485d1e352182 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 16 Mar 2009 21:00:28 +1100 Subject: slob: fix lockup in slob_free() Don't hold SLOB lock when freeing the page. Reduces lock hold width. See the following thread for discussion of the bug: http://marc.info/?l=linux-kernel&m=123709983214143&w=2 Reported-by: Ingo Molnar Acked-by: Matt Mackall Signed-off-by: Nick Piggin Signed-off-by: Pekka Enberg --- mm/slob.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..f901653707a4 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -393,10 +393,11 @@ static void slob_free(void *block, int size) /* Go directly to page allocator. 
Do not pass slob allocator */ if (slob_page_free(sp)) clear_slob_page_free(sp); + spin_unlock_irqrestore(&slob_lock, flags); clear_slob_page(sp); free_slob_page(sp); free_page((unsigned long)b); - goto out; + return; } if (!slob_page_free(sp)) { -- cgit v1.2.3 From 26160158d3d3df548f4ee046cc6147fe048cfa9c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Mar 2009 09:35:06 +0100 Subject: Move the default_backing_dev_info out of readahead.c and into backing-dev.c It really makes no sense to have it in readahead.c, so move it where it belongs. Signed-off-by: Jens Axboe --- mm/backing-dev.c | 26 +++++++++++++++++++++++++- mm/readahead.c | 25 ------------------------- 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8e8587444132..be68c956a660 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -2,11 +2,24 @@ #include #include #include +#include #include #include #include #include +void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ +} +EXPORT_SYMBOL(default_unplug_io_fn); + +struct backing_dev_info default_backing_dev_info = { + .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, + .state = 0, + .capabilities = BDI_CAP_MAP_COPY, + .unplug_io_fn = default_unplug_io_fn, +}; +EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; @@ -166,9 +179,20 @@ static __init int bdi_class_init(void) bdi_debug_init(); return 0; } - postcore_initcall(bdi_class_init); +static int __init default_bdi_init(void) +{ + int err; + + err = bdi_init(&default_backing_dev_info); + if (!err) + bdi_register(&default_backing_dev_info, NULL, "default"); + + return err; +} +subsys_initcall(default_bdi_init); + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { diff --git a/mm/readahead.c b/mm/readahead.c index bec83c15a78f..9ce303d4b810 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,19 +17,6 @@ #include #include -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - -struct backing_dev_info default_backing_dev_info = { - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -233,18 +220,6 @@ unsigned long max_sane_readahead(unsigned long nr) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } -static int __init readahead_init(void) -{ - int err; - - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); - - return err; -} -subsys_initcall(readahead_init); - /* * Submit IO for the read-ahead request in file_ra_state. */ -- cgit v1.2.3 From 1b5e62b42b55c509eea04c3c0f25e42c8b35b564 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 23 Mar 2009 08:57:38 +0800 Subject: writeback: double the dirty thresholds Enlarge default dirty ratios from 5/10 to 10/20. This fixes [Bug #12809] iozone regression with 2.6.29-rc6. The iozone benchmarks are performed on a 1200M file, with 8GB ram. iozone -i 0 -i 1 -i 2 -i 3 -i 4 -r 4k -s 64k -s 512m -s 1200m -b tmp.xls iozone -B -r 4k -s 64k -s 512m -s 1200m -b tmp.xls The performance regression is triggered by commit 1cf6e7d83bf3(mm: task dirty accounting fix), which makes more correct/thorough dirty accounting. 
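(Rough arithmetic, not part of the original changelog: on the 8GB machine above, and ignoring the dirtyable-memory correction, 5%/10% throttle background writeback near 400MB and foreground writers near 800MB of dirty pages, so the 1200MB iozone file can never stay fully dirty; doubling the defaults to 10%/20% raises those limits to roughly 800MB and 1.6GB.)
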
The default 5/10 dirty ratios were picked (a) with the old dirty logic and (b) largely at random and (c) designed to be aggressive. In particular, that (a) means that having fixed some of the dirty accounting, maybe the real bug is now that it was always too aggressive, just hidden by an accounting issue. The enlarged 10/20 dirty ratios are just about enough to fix the regression. [ We will have to look at how this affects the old fsync() latency issue, but that probably will need independent work. - Linus ] Cc: Nick Piggin Cc: Peter Zijlstra Reported-by: "Lin, Ming M" Tested-by: "Lin, Ming M" Signed-off-by: Wu Fengguang Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 74dc57c74349..40ca7cdb653e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void) /* * Start background writeback (via pdflush) at this percentage */ -int dirty_background_ratio = 5; +int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of @@ -83,7 +83,7 @@ int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 10; +int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of -- cgit v1.2.3 From 1a2142afa5646ad5af44bbe1febaa5e0b7e71156 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:10 -0600 Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR, &CPU_MASK_ALL Impact: cleanup (Thanks to Al Viro for reminding me of this, via Ingo) CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so: #define CPU_MASK_ALL (cpumask_t) { { ... } } Taking the address of such a temporary is questionable at best, unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added CPU_MASK_ALL_PTR: #define CPU_MASK_ALL_PTR (&CPU_MASK_ALL) Which formalizes this practice. One day gcc could bite us over this usage (though we seem to have gotten away with it so far). So replace everywhere which used &CPU_MASK_ALL or CPU_MASK_ALL_PTR with the modern "cpu_all_mask" (a real const struct cpumask *). Signed-off-by: Rusty Russell Acked-by: Ingo Molnar Reported-by: Al Viro Cc: Mike Travis --- init/main.c | 2 +- kernel/kmod.c | 2 +- kernel/kthread.c | 4 ++-- mm/pdflush.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/init/main.c b/init/main.c index 6bf83afd654d..1ac7ec78e601 100644 --- a/init/main.c +++ b/init/main.c @@ -842,7 +842,7 @@ static int __init kernel_init(void * unused) /* * init can run on any cpu. */ - set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); + set_cpus_allowed_ptr(current, cpu_all_mask); /* * Tell the world that we're going to be the grim * reaper of innocent orphaned children. diff --git a/kernel/kmod.c b/kernel/kmod.c index a27a5f64443d..f0c8f545180d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -167,7 +167,7 @@ static int ____call_usermodehelper(void *data) } /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); + set_cpus_allowed_ptr(current, cpu_all_mask); /* * Our parent is keventd, which runs with elevated scheduling priority. 
diff --git a/kernel/kthread.c b/kernel/kthread.c index 4fbc456f393d..84bbadd4d021 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create) */ sched_setscheduler(create->result, SCHED_NORMAL, &param); set_user_nice(create->result, KTHREAD_NICE_LEVEL); - set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); + set_cpus_allowed_ptr(create->result, cpu_all_mask); } complete(&create->done); } @@ -240,7 +240,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_user_nice(tsk, KTHREAD_NICE_LEVEL); - set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); + set_cpus_allowed_ptr(tsk, cpu_all_mask); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/mm/pdflush.c b/mm/pdflush.c index 15de509b68fd..118905e3d788 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -191,7 +191,7 @@ static int pdflush(void *dummy) /* * Some configs put our parent kthread in a limited cpuset, - * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL. + * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. * Our needs are more modest - cut back to our cpusets cpus_allowed. * This is needed as pdflush's are dynamically created and destroyed. * The boottime pdflush's are easily placed w/o these 2 lines. -- cgit v1.2.3 From aa85ea5b89c36c51200d795dd788139bd9b8cf50 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:15 -0600 Subject: cpumask: use new cpumask_ functions in core code. Impact: cleanup Time to clean up remaining laggards using the old cpu_ functions. Signed-off-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Trond.Myklebust@netapp.com --- drivers/base/cpu.c | 2 +- include/linux/cpuset.h | 4 ++-- kernel/workqueue.c | 6 +++--- mm/allocpercpu.c | 2 +- mm/vmstat.c | 2 +- net/sunrpc/svc.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 5b257a57bc57..e62a4ccea54d 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -119,7 +119,7 @@ static ssize_t print_cpus_map(char *buf, const struct cpumask *map) #define print_cpus_func(type) \ static ssize_t print_cpus_##type(struct sysdev_class *class, char *buf) \ { \ - return print_cpus_map(buf, &cpu_##type##_map); \ + return print_cpus_map(buf, cpu_##type##_mask); \ } \ static struct sysdev_class_attribute attr_##type##_map = \ _SYSDEV_CLASS_ATTR(type, 0444, print_cpus_##type, NULL) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 90c6074a36ca..2e0d79678deb 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -90,12 +90,12 @@ static inline void cpuset_init_smp(void) {} static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { - *mask = cpu_possible_map; + cpumask_copy(mask, cpu_possible_mask); } static inline void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask) { - *mask = cpu_possible_map; + cpumask_copy(mask, cpu_possible_mask); } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1f0c509b40d3..9aedd9fd825b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -416,7 +416,7 @@ void flush_workqueue(struct workqueue_struct *wq) might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); }
EXPORT_SYMBOL_GPL(flush_workqueue); @@ -547,7 +547,7 @@ static void wait_on_work(struct work_struct *work) wq = cwq->wq; cpu_map = wq_cpu_map(wq); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } @@ -911,7 +911,7 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); cpu_maps_update_done(); diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 1882923bc706..139d5b7b6621 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -143,7 +143,7 @@ void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; - __percpu_depopulate_mask(__pdata, &cpu_possible_map); + __percpu_depopulate_mask(__pdata, cpu_possible_mask); kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); diff --git a/mm/vmstat.c b/mm/vmstat.c index 91149746bb8d..8cd81ea1ddc1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -27,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu_mask_nr(cpu, *cpumask) { + for_each_cpu(cpu, cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..bb507e2bb94d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -312,7 +312,7 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) switch (m->mode) { case SVC_POOL_PERCPU: { - set_cpus_allowed_ptr(task, &cpumask_of_cpu(node)); + set_cpus_allowed_ptr(task, cpumask_of(node)); break; } case SVC_POOL_PERNODE: -- cgit v1.2.3 From 19cefdffbfe0f7e280f21e80875937e8700e99e2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Mar 2009 06:03:11 +0100 Subject: lockdep: annotate reclaim context (__GFP_NOFS), fix SLOB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: build fix fix typo in mm/slob.c: mm/slob.c:469: error: ‘flags’ undeclared (first use in this function) mm/slob.c:469: error: (Each undeclared identifier is reported only once mm/slob.c:469: error: for each function it appears in.) Cc: Nick Piggin Cc: Peter Zijlstra LKML-Reference: <20090128135457.350751756@chello.nl> Signed-off-by: Ingo Molnar --- mm/slob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 1264799df5d1..4b1c0c1d63cb 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -464,7 +464,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); - lockdep_trace_alloc(flags); + lockdep_trace_alloc(gfp); if (size < PAGE_SIZE - align) { if (!size) -- cgit v1.2.3
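The two cpumask patches above follow one conversion pattern: full cpumask_t values and the old cpu_*_map / CPU_MASK_ALL_PTR globals are replaced by const struct cpumask pointers (cpu_all_mask, cpu_possible_mask, ...) together with the cpumask_copy()/for_each_cpu() accessors. A rough before/after sketch only, not taken from any patch in this series: the walk_online_cpus_*() wrappers and their pr_info() output are made up for illustration, and only the cpumask API calls are real.

/* Old style: iterates with for_each_cpu_mask_nr() over a cpumask_t
 * value and takes the address of the deprecated CPU_MASK_ALL global. */
static void walk_online_cpus_old(struct task_struct *tsk)
{
	int cpu;

	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);	/* i.e. &CPU_MASK_ALL */
	for_each_cpu_mask_nr(cpu, cpu_online_map)
		pr_info("cpu %d is online\n", cpu);
}

/* New style: only const struct cpumask pointers are passed around, so
 * there are no struct copies and no address-of-a-temporary games. */
static void walk_online_cpus_new(struct task_struct *tsk)
{
	int cpu;

	set_cpus_allowed_ptr(tsk, cpu_all_mask);
	for_each_cpu(cpu, cpu_online_mask)
		pr_info("cpu %d is online\n", cpu);
}

Passing pointers rather than cpumask_t values is also what lets large-NR_CPUS configurations keep the masks off the stack.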