[PATCH] cpuset memory spread basic implementation

This patch provides the implementation and cpuset interface for an alternative memory allocation policy that can be applied to certain kinds of memory allocations, such as the page cache (file system buffers) and some slab caches (such as inode caches). The policy is called "memory spreading." If enabled, it spreads out these kinds of memory allocations over all the nodes allowed to a task, instead of preferring to place them on the node where the task is executing. All other kinds of allocations, including anonymous pages for a tasks stack and data regions, are not affected by this policy choice, and continue to be allocated preferring the node local to execution, as modified by the NUMA mempolicy. There are two boolean flag files per cpuset that control where the kernel allocates pages for the file system buffers and related in kernel data structures. They are called 'memory_spread_page' and 'memory_spread_slab'. If the per-cpuset boolean flag file 'memory_spread_page' is set, then the kernel will spread the file system buffers (page cache) evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the kernel will spread some file system related slab caches, such as for inodes and dentries evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. The implementation is simple. Setting the cpuset flags 'memory_spread_page' or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or subsequently joins that cpuset. In subsequent patches, the page allocation calls for the affected page cache and slab caches are modified to perform an inline check for these flags, and if set, a call to a new routine cpuset_mem_spread_node() returns the node to prefer for the allocation. The cpuset_mem_spread_node() routine is also simple. It uses the value of a per-task rotor cpuset_mem_spread_rotor to select the next node in the current tasks mems_allowed to prefer for the allocation. This policy can provide substantial improvements for jobs that need to place thread local data on the corresponding node, but that need to access large file system data sets that need to be spread across the several nodes in the jobs cpuset in order to fit. Without this patch, especially for jobs that might have one thread reading in the data set, the memory allocation across the nodes in the jobs cpuset can become very uneven. A couple of Copyright year ranges are updated as well. And a couple of email addresses that can be found in the MAINTAINERS file are removed. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Paul Jackson <pj@sgi.com> 2006-03-24 12:16:03 +0100
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-03-24 16:33:22 +0100
commit: 825a46af5ac171f9f41f794a0a00165588ba1589 (patch)
tree: b690fe9d809d7b047f0393097fc79892e1217d98 /include
parent: [PATCH] cpuset use combined atomic_inc_return calls (diff)
download: linux-825a46af5ac171f9f41f794a0a00165588ba1589.tar.xz
linux-825a46af5ac171f9f41f794a0a00165588ba1589.zip
2 files changed, 31 insertions, 1 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3bc606927116..9354722a9217 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -4,7 +4,7 @@
  *  cpuset interface
  *
  *  Copyright (C) 2003 BULL SA
- *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  */
 
@@ -51,6 +51,18 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
 
+extern int cpuset_mem_spread_node(void);
+
+static inline int cpuset_do_page_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_PAGE;
+}
+
+static inline int cpuset_do_slab_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_SLAB;
+}
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -99,6 +111,21 @@ static inline char *cpuset_task_status_allowed(struct task_struct *task,
 static inline void cpuset_lock(void) {}
 static inline void cpuset_unlock(void) {}
 
+static inline int cpuset_mem_spread_node(void)
+{
+	return 0;
+}
+
+static inline int cpuset_do_page_mem_spread(void)
+{
+	return 0;
+}
+
+static inline int cpuset_do_slab_mem_spread(void)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e60a91d5b369..b0e37cfa09f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -869,6 +869,7 @@ struct task_struct {
 	struct cpuset *cpuset;
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
+	int cpuset_mem_spread_rotor;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
@@ -929,6 +930,8 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
author	Paul Jackson <pj@sgi.com>	2006-03-24 12:16:03 +0100
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-03-24 16:33:22 +0100
commit	825a46af5ac171f9f41f794a0a00165588ba1589 (patch)
tree	b690fe9d809d7b047f0393097fc79892e1217d98 /include
parent	[PATCH] cpuset use combined atomic_inc_return calls (diff)
download	linux-825a46af5ac171f9f41f794a0a00165588ba1589.tar.xz linux-825a46af5ac171f9f41f794a0a00165588ba1589.zip