diff options
author | Len Brown <len.brown@intel.com> | 2005-12-06 23:31:30 +0100 |
---|---|---|
committer | Len Brown <len.brown@intel.com> | 2005-12-06 23:31:30 +0100 |
commit | 3d5271f9883cba7b54762bc4fe027d4172f06db7 (patch) | |
tree | ab8a881a14478598a0c8bda0d26c62cdccfffd6d /kernel | |
parent | [ACPI] 8250_acpi.c buildfix (diff) | |
parent | Auto-update from upstream (diff) | |
download | linux-3d5271f9883cba7b54762bc4fe027d4172f06db7.tar.xz linux-3d5271f9883cba7b54762bc4fe027d4172f06db7.zip |
Pull release into acpica branch
Diffstat (limited to 'kernel')
43 files changed, 2573 insertions, 1620 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ff4dc02ce170..4f5a1453093a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o @@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index b756f527497e..6312d6bd43e3 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -54,6 +54,7 @@ #include <linux/jiffies.h> #include <linux/times.h> #include <linux/syscalls.h> +#include <linux/mount.h> #include <asm/uaccess.h> #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ @@ -192,6 +193,7 @@ static void acct_file_reopen(struct file *file) add_timer(&acct_globals.timer); } if (old_acct) { + mnt_unpin(old_acct->f_vfsmnt); spin_unlock(&acct_globals.lock); do_acct_process(0, old_acct); filp_close(old_acct, NULL); @@ -199,6 +201,42 @@ static void acct_file_reopen(struct file *file) } } +static int acct_on(char *name) +{ + struct file *file; + int error; + + /* Difference from BSD - they don't do O_APPEND */ + file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + filp_close(file, NULL); + return -EACCES; + } + + if (!file->f_op->write) { + filp_close(file, NULL); + return -EIO; + } + + error = security_acct(file); + if (error) { + filp_close(file, NULL); + return error; + } + + spin_lock(&acct_globals.lock); + mnt_pin(file->f_vfsmnt); + acct_file_reopen(file); + spin_unlock(&acct_globals.lock); + + mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ + + return 0; +} + /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting @@ -212,47 +250,41 @@ static void acct_file_reopen(struct file *file) */ asmlinkage long sys_acct(const char __user *name) { - struct file *file = NULL; - char *tmp; int error; if (!capable(CAP_SYS_PACCT)) return -EPERM; if (name) { - tmp = getname(name); - if (IS_ERR(tmp)) { + char *tmp = getname(name); + if (IS_ERR(tmp)) return (PTR_ERR(tmp)); - } - /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + error = acct_on(tmp); putname(tmp); - if (IS_ERR(file)) { - return (PTR_ERR(file)); - } - if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { - filp_close(file, NULL); - return (-EACCES); - } - - if (!file->f_op->write) { - filp_close(file, NULL); - return (-EIO); + } else { + error = security_acct(NULL); + if (!error) { + spin_lock(&acct_globals.lock); + acct_file_reopen(NULL); + spin_unlock(&acct_globals.lock); } } + return error; +} - error = security_acct(file); - if (error) { - if (file) - filp_close(file, NULL); - return error; - } - +/** + * acct_auto_close - turn off a filesystem's accounting if it is on + * @m: vfsmount being shut down + * + * If the accounting is turned on for a file in the subtree pointed to + * to by m, turn accounting off. Done when m is about to die. + */ +void acct_auto_close_mnt(struct vfsmount *m) +{ spin_lock(&acct_globals.lock); - acct_file_reopen(file); + if (acct_globals.file && acct_globals.file->f_vfsmnt == m) + acct_file_reopen(NULL); spin_unlock(&acct_globals.lock); - - return (0); } /** @@ -266,8 +298,8 @@ void acct_auto_close(struct super_block *sb) { spin_lock(&acct_globals.lock); if (acct_globals.file && - acct_globals.file->f_dentry->d_inode->i_sb == sb) { - acct_file_reopen((struct file *)NULL); + acct_globals.file->f_vfsmnt->mnt_sb == sb) { + acct_file_reopen(NULL); } spin_unlock(&acct_globals.lock); } @@ -553,7 +585,7 @@ void acct_update_integrals(struct task_struct *tsk) if (delta == 0) return; tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } } diff --git a/kernel/audit.c b/kernel/audit.c index 83096b67510a..0c56320d38dc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -133,7 +133,7 @@ struct audit_buffer { struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ - int gfp_mask; + gfp_t gfp_mask; }; static void audit_set_pid(struct audit_buffer *ab, pid_t pid) @@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab) } static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, - unsigned int __nocast gfp_mask, int type) + gfp_t gfp_mask, int type) { unsigned long flags; struct audit_buffer *ab = NULL; @@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ -struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, +struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab = NULL; @@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab) /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, int gfp_mask, int type, +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 88696f639aab..d8a68509e729 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab) up_read(&mm->mmap_sem); } -static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) +static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) { int i; struct audit_buffer *ab; diff --git a/kernel/cpu.c b/kernel/cpu.c index 53d8263ae12e..e882c6babf41 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -16,28 +16,76 @@ #include <asm/semaphore.h> /* This protects CPUs going up and down... */ -DECLARE_MUTEX(cpucontrol); +static DECLARE_MUTEX(cpucontrol); static struct notifier_block *cpu_chain; +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *lock_cpu_hotplug_owner; +static int lock_cpu_hotplug_depth; + +static int __lock_cpu_hotplug(int interruptible) +{ + int ret = 0; + + if (lock_cpu_hotplug_owner != current) { + if (interruptible) + ret = down_interruptible(&cpucontrol); + else + down(&cpucontrol); + } + + /* + * Set only if we succeed in locking + */ + if (!ret) { + lock_cpu_hotplug_depth++; + lock_cpu_hotplug_owner = current; + } + + return ret; +} + +void lock_cpu_hotplug(void) +{ + __lock_cpu_hotplug(0); +} +EXPORT_SYMBOL_GPL(lock_cpu_hotplug); + +void unlock_cpu_hotplug(void) +{ + if (--lock_cpu_hotplug_depth == 0) { + lock_cpu_hotplug_owner = NULL; + up(&cpucontrol); + } +} +EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); + +int lock_cpu_hotplug_interruptible(void) +{ + return __lock_cpu_hotplug(1); +} +EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); +#endif /* CONFIG_HOTPLUG_CPU */ + /* Need to know about CPUs going up/down? */ int register_cpu_notifier(struct notifier_block *nb) { int ret; - if ((ret = down_interruptible(&cpucontrol)) != 0) + if ((ret = lock_cpu_hotplug_interruptible()) != 0) return ret; ret = notifier_chain_register(&cpu_chain, nb); - up(&cpucontrol); + unlock_cpu_hotplug(); return ret; } EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - down(&cpucontrol); + lock_cpu_hotplug(); notifier_chain_unregister(&cpu_chain, nb); - up(&cpucontrol); + unlock_cpu_hotplug(); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -155,13 +203,14 @@ int __devinit cpu_up(unsigned int cpu) int ret; void *hcpu = (void *)(long)cpu; - if ((ret = down_interruptible(&cpucontrol)) != 0) + if ((ret = lock_cpu_hotplug_interruptible()) != 0) return ret; if (cpu_online(cpu) || !cpu_present(cpu)) { ret = -EINVAL; goto out; } + ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", @@ -184,6 +233,6 @@ out_notify: if (ret != 0) notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); out: - up(&cpucontrol); + unlock_cpu_hotplug(); return ret; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 79866bc6b3a1..7430640f9816 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -32,6 +32,7 @@ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/list.h> +#include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mount.h> @@ -60,6 +61,9 @@ struct cpuset { cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + /* + * Count is atomic so can incr (fork) or decr (exit) without a lock. + */ atomic_t count; /* count tasks using this cpuset */ /* @@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount; static struct super_block *cpuset_sb = NULL; /* - * cpuset_sem should be held by anyone who is depending on the children - * or sibling lists of any cpuset, or performing non-atomic operations - * on the flags or *_allowed values of a cpuset, such as raising the - * CS_REMOVED flag bit iff it is not already raised, or reading and - * conditionally modifying the *_allowed values. One kernel global - * cpuset semaphore should be sufficient - these things don't change - * that much. - * - * The code that modifies cpusets holds cpuset_sem across the entire - * operation, from cpuset_common_file_write() down, single threading - * all cpuset modifications (except for counter manipulations from - * fork and exit) across the system. This presumes that cpuset - * modifications are rare - better kept simple and safe, even if slow. - * - * The code that reads cpusets, such as in cpuset_common_file_read() - * and below, only holds cpuset_sem across small pieces of code, such - * as when reading out possibly multi-word cpumasks and nodemasks, as - * the risks are less, and the desire for performance a little greater. - * The proc_cpuset_show() routine needs to hold cpuset_sem to insure - * that no cs->dentry is NULL, as it walks up the cpuset tree to root. - * - * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't - * (usually) grab cpuset_sem. These are the two most performance - * critical pieces of code here. The exception occurs on exit(), - * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * We have two global cpuset semaphores below. They can nest. + * It is ok to first take manage_sem, then nest callback_sem. We also + * require taking task_lock() when dereferencing a tasks cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both semaphores to modify cpusets. If a task + * holds manage_sem, then it blocks others wanting that semaphore, + * ensuring that it is the only task able to also acquire callback_sem + * and be able to modify cpusets. It can perform various checks on + * the cpuset structure first, knowing nothing will change. It can + * also allocate memory while just holding manage_sem. While it is + * performing these checks, various callback routines can briefly + * acquire callback_sem to query cpusets. Once it is ready to make + * the changes, it takes callback_sem, blocking everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_sem, as that would risk double tripping on callback_sem + * from one of the callbacks into the cpuset code from within + * __alloc_pages(). + * + * If a task is only holding callback_sem, then it has read-only + * access to cpusets. + * + * The task_struct fields mems_allowed and mems_generation may only + * be accessed in the context of that task, so require no locks. + * + * Any task can increment and decrement the count field without lock. + * So in general, code holding manage_sem or callback_sem can't rely + * on the count field not changing. However, if the count goes to + * zero, then only attach_task(), which holds both semaphores, can + * increment it again. Because a count of zero means that no tasks + * are currently attached, therefore there is no way a task attached + * to that cpuset can fork (the other way to increment the count). + * So code holding manage_sem or callback_sem can safely assume that + * if the count is zero, it will stay zero. Similarly, if a task + * holds manage_sem or callback_sem on a cpuset with zero count, it + * knows that the cpuset won't be removed, as cpuset_rmdir() needs + * both of those semaphores. + * + * A possible optimization to improve parallelism would be to make + * callback_sem a R/W semaphore (rwsem), allowing the callback routines + * to proceed in parallel, with read access, until the holder of + * manage_sem needed to take this rwsem for exclusive write access + * and modify some cpusets. + * + * The cpuset_common_file_write handler for operations that modify + * the cpuset hierarchy holds manage_sem across the entire operation, + * single threading all such cpuset modifications across the system. + * + * The cpuset_common_file_read() handlers only hold callback_sem across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + * + * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't + * (usually) take either semaphore. These are the two most performance + * critical pieces of code here. The exception occurs on cpuset_exit(), + * when a task in a notify_on_release cpuset exits. Then manage_sem * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * - * A cpuset can only be deleted if both its 'count' of using tasks is - * zero, and its list of 'children' cpusets is empty. Since all tasks - * in the system use _some_ cpuset, and since there is always at least - * one task in the system (init, pid == 1), therefore, top_cpuset - * always has either children cpusets and/or using tasks. So no need - * for any special hack to ensure that top_cpuset cannot be deleted. + * A cpuset can only be deleted if both its 'count' of using tasks + * is zero, and its list of 'children' cpusets is empty. Since all + * tasks in the system use _some_ cpuset, and since there is always at + * least one task in the system (init, pid == 1), therefore, top_cpuset + * always has either children cpusets and/or using tasks. So we don't + * need a special hack to ensure that top_cpuset cannot be deleted. + * + * The above "Tale of Two Semaphores" would be complete, but for: + * + * The task_lock() exception + * + * The need for this exception arises from the action of attach_task(), + * which overwrites one tasks cpuset pointer with another. It does + * so using both semaphores, however there are several performance + * critical places that need to reference task->cpuset without the + * expense of grabbing a system global semaphore. Therefore except as + * noted below, when dereferencing or, as in attach_task(), modifying + * a tasks cpuset pointer we use task_lock(), which acts on a spinlock + * (task->alloc_lock) already in the task_struct routinely used for + * such matters. */ -static DECLARE_MUTEX(cpuset_sem); -static struct task_struct *cpuset_sem_owner; -static int cpuset_sem_depth; - -/* - * The global cpuset semaphore cpuset_sem can be needed by the - * memory allocator to update a tasks mems_allowed (see the calls - * to cpuset_update_current_mems_allowed()) or to walk up the - * cpuset hierarchy to find a mem_exclusive cpuset see the calls - * to cpuset_excl_nodes_overlap()). - * - * But if the memory allocation is being done by cpuset.c code, it - * usually already holds cpuset_sem. Double tripping on a kernel - * semaphore deadlocks the current task, and any other task that - * subsequently tries to obtain the lock. - * - * Run all up's and down's on cpuset_sem through the following - * wrappers, which will detect this nested locking, and avoid - * deadlocking. - */ - -static inline void cpuset_down(struct semaphore *psem) -{ - if (cpuset_sem_owner != current) { - down(psem); - cpuset_sem_owner = current; - } - cpuset_sem_depth++; -} - -static inline void cpuset_up(struct semaphore *psem) -{ - if (--cpuset_sem_depth == 0) { - cpuset_sem_owner = NULL; - up(psem); - } -} +static DECLARE_MUTEX(manage_sem); +static DECLARE_MUTEX(callback_sem); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) } /* - * Call with cpuset_sem held. Writes path of cpuset into buf. + * Call with manage_sem held. Writes path of cpuset into buf. * Returns 0 on success, -errno on error. */ @@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * status of the /sbin/cpuset_release_agent task, so no sense holding * our caller up for that. * - * The simple act of forking that task might require more memory, - * which might need cpuset_sem. So this routine must be called while - * cpuset_sem is not held, to avoid a possible deadlock. See also - * comments for check_for_release(), below. + * When we had only one cpuset semaphore, we had to call this + * without holding it, to avoid deadlock when call_usermodehelper() + * allocated memory. With two locks, we could now call this while + * holding manage_sem, but we still don't, so as to minimize + * the time manage_sem is held. */ static void cpuset_release_agent(const char *pathbuf) @@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf) * cs is notify_on_release() and now both the user count is zero and * the list of children is empty, prepare cpuset path in a kmalloc'd * buffer, to be returned via ppathbuf, so that the caller can invoke - * cpuset_release_agent() with it later on, once cpuset_sem is dropped. - * Call here with cpuset_sem held. + * cpuset_release_agent() with it later on, once manage_sem is dropped. + * Call here with manage_sem held. * * This check_for_release() routine is responsible for kmalloc'ing * pathbuf. The above cpuset_release_agent() is responsible for * kfree'ing pathbuf. The caller of these routines is responsible * for providing a pathbuf pointer, initialized to NULL, then - * calling check_for_release() with cpuset_sem held and the address - * of the pathbuf pointer, then dropping cpuset_sem, then calling + * calling check_for_release() with manage_sem held and the address + * of the pathbuf pointer, then dropping manage_sem, then calling * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ @@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) * One way or another, we guarantee to return some non-empty subset * of cpu_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) @@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) * One way or another, we guarantee to return some non-empty subset * of node_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) @@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) } /* - * Refresh current tasks mems_allowed and mems_generation from - * current tasks cpuset. Call with cpuset_sem held. + * Refresh current tasks mems_allowed and mems_generation from current + * tasks cpuset. + * + * Call without callback_sem or task_lock() held. May be called with + * or without manage_sem held. Will acquire task_lock() and might + * acquire callback_sem during call. + * + * The task_lock() is required to dereference current->cpuset safely. + * Without it, we could pick up the pointer value of current->cpuset + * in one instruction, and then attach_task could give us a different + * cpuset, and then the cpuset we had could be removed and freed, + * and then on our next instruction, we could dereference a no longer + * valid cpuset pointer to get its mems_generation field. * - * This routine is needed to update the per-task mems_allowed - * data, within the tasks context, when it is trying to allocate - * memory (in various mm/mempolicy.c routines) and notices - * that some other task has been modifying its cpuset. + * This routine is needed to update the per-task mems_allowed data, + * within the tasks context, when it is trying to allocate memory + * (in various mm/mempolicy.c routines) and notices that some other + * task has been modifying its cpuset. */ static void refresh_mems(void) { - struct cpuset *cs = current->cpuset; + int my_cpusets_mem_gen; - if (current->cpuset_mems_generation != cs->mems_generation) { + task_lock(current); + my_cpusets_mem_gen = current->cpuset->mems_generation; + task_unlock(current); + + if (current->cpuset_mems_generation != my_cpusets_mem_gen) { + struct cpuset *cs; + nodemask_t oldmem = current->mems_allowed; + + down(&callback_sem); + task_lock(current); + cs = current->cpuset; guarantee_online_mems(cs, ¤t->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; + task_unlock(current); + up(&callback_sem); + if (!nodes_equal(oldmem, current->mems_allowed)) + numa_policy_rebind(&oldmem, ¤t->mems_allowed); } } @@ -579,7 +620,7 @@ static void refresh_mems(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. + * are only set if the other's are set. Call holding manage_sem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_sem held. + * manage_sem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * exclusive child cpusets * Build these two partitions by calling partition_sched_domains * - * Call with cpuset_sem held. May nest a call to the + * Call with manage_sem held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ @@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) unlock_cpu_hotplug(); } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + down(&callback_sem); cs->cpus_allowed = trialcs.cpus_allowed; + up(&callback_sem); if (is_cpu_exclusive(cs) && !cpus_unchanged) update_cpu_domains(cs); return 0; } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval == 0) { + down(&callback_sem); cs->mems_allowed = trialcs.mems_allowed; atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); } return retval; } @@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) * CS_NOTIFY_ON_RELEASE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 + * + * Call with manage_sem held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return err; cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); + down(&callback_sem); if (turning_on) set_bit(bit, &cs->flags); else clear_bit(bit, &cs->flags); + up(&callback_sem); if (cpu_exclusive_changed) update_cpu_domains(cs); return 0; } +/* + * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly + * writing the path of the old cpuset in 'ppathbuf' if it needs to be + * notified on release. + * + * Call holding manage_sem. May take callback_sem and task_lock of + * the task 'pid' during call. + */ + static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; @@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); - if (!tsk) { + if (!tsk || tsk->flags & PF_EXITING) { read_unlock(&tasklist_lock); return -ESRCH; } @@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + down(&callback_sem); + task_lock(tsk); oldcs = tsk->cpuset; if (!oldcs) { task_unlock(tsk); + up(&callback_sem); put_task_struct(tsk); return -ESRCH; } @@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + up(&callback_sem); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); @@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - cpuset_down(&cpuset_sem); + down(&manage_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - cpuset_up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&callback_sem); mask = cs->cpus_allowed; - cpuset_up(&cpuset_sem); + up(&callback_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - cpuset_down(&cpuset_sem); + down(&callback_sem); mask = cs->mems_allowed; - cpuset_up(&cpuset_sem); + up(&callback_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -968,8 +1038,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, char *page; ssize_t retval = 0; char *s; - char *start; - size_t n; if (!(page = (char *)__get_free_page(GFP_KERNEL))) return -ENOMEM; @@ -997,16 +1065,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, goto out; } *s++ = '\n'; - *s = '\0'; - /* Do nothing if *ppos is at the eof or beyond the eof. */ - if (s - page <= *ppos) - return 0; - - start = page + *ppos; - n = s - start; - retval = n - copy_to_user(buf, start, min(n, nbytes)); - *ppos += retval; + retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); out: free_page((unsigned long)page); return retval; @@ -1057,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) return 0; } +/* + * cpuset_rename - Only allow simple rename of directories in place. + */ +static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) + return -ENOTDIR; + if (new_dentry->d_inode) + return -EEXIST; + if (old_dir != new_dir) + return -EIO; + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, @@ -1069,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cpuset_mkdir, .rmdir = cpuset_rmdir, + .rename = cpuset_rename, }; static int cpuset_create_file(struct dentry *dentry, int mode) @@ -1172,7 +1248,9 @@ struct ctr_struct { /* * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. - * Return actual number of pids loaded. + * Return actual number of pids loaded. No need to task_lock(p) + * when reading out p->cpuset, as we don't really care if it changes + * on the next cycle, and we are not going to try to dereference it. */ static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) { @@ -1214,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) return cnt; } +/* + * Handle an open on 'tasks' file. Prepare a buffer listing the + * process id's of tasks currently attached to the cpuset being opened. + * + * Does not require any specific cpuset semaphores, and does not take any. + */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { struct cpuset *cs = __d_cs(file->f_dentry->d_parent); @@ -1361,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - cpuset_down(&cpuset_sem); + down(&manage_sem); + refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1375,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) cs->parent = parent; + down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + up(&callback_sem); err = cpuset_create_dir(cs, name, mode); if (err < 0) goto err; /* - * Release cpuset_sem before cpuset_populate_dir() because it + * Release manage_sem before cpuset_populate_dir() because it * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. */ - cpuset_up(&cpuset_sem); + up(&manage_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - cpuset_up(&cpuset_sem); + up(&manage_sem); kfree(cs); return err; } @@ -1415,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - cpuset_down(&cpuset_sem); + down(&manage_sem); + refresh_mems(); if (atomic_read(&cs->count) > 0) { - cpuset_up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - cpuset_up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } parent = cs->parent; + down(&callback_sem); set_bit(CS_REMOVED, &cs->flags); if (is_cpu_exclusive(cs)) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ - if (list_empty(&parent->children)) - check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - cpuset_up(&cpuset_sem); + up(&callback_sem); + if (list_empty(&parent->children)) + check_for_release(parent, &pathbuf); + up(&manage_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1497,16 +1587,26 @@ void __init cpuset_init_smp(void) * cpuset_fork - attach newly forked task to its parents cpuset. * @tsk: pointer to task_struct of forking parent process. * - * Description: By default, on fork, a task inherits its - * parent's cpuset. The pointer to the shared cpuset is - * automatically copied in fork.c by dup_task_struct(). - * This cpuset_fork() routine need only increment the usage - * counter in that cpuset. + * Description: A task inherits its parent's cpuset at fork(). + * + * A pointer to the shared cpuset was automatically copied in fork.c + * by dup_task_struct(). However, we ignore that copy, since it was + * not made under the protection of task_lock(), so might no longer be + * a valid cpuset pointer. attach_task() might have already changed + * current->cpuset, allowing the previously referenced cpuset to + * be removed and freed. Instead, we task_lock(current) and copy + * its present value of current->cpuset for our freshly forked child. + * + * At the point that cpuset_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. **/ -void cpuset_fork(struct task_struct *tsk) +void cpuset_fork(struct task_struct *child) { - atomic_inc(&tsk->cpuset->count); + task_lock(current); + child->cpuset = current->cpuset; + atomic_inc(&child->cpuset->count); + task_unlock(current); } /** @@ -1515,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * - * Note that cpusets marked notify_on_release force every task - * in them to take the global cpuset_sem semaphore when exiting. - * This could impact scaling on very large systems. Be reluctant - * to use notify_on_release cpusets where very high task exit - * scaling is required on large systems. - * - * Don't even think about derefencing 'cs' after the cpuset use - * count goes to zero, except inside a critical section guarded - * by the cpuset_sem semaphore. If you don't hold cpuset_sem, - * then a zero cpuset use count is a license to any other task to - * nuke the cpuset immediately. + * Note that cpusets marked notify_on_release force every task in + * them to take the global manage_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant to + * use notify_on_release cpusets where very high task exit scaling + * is required on large systems. + * + * Don't even think about derefencing 'cs' after the cpuset use count + * goes to zero, except inside a critical section guarded by manage_sem + * or callback_sem. Otherwise a zero cpuset use count is a license to + * any other task to nuke the cpuset immediately, via cpuset_rmdir(). + * + * This routine has to take manage_sem, not callback_sem, because + * it is holding that semaphore while calling check_for_release(), + * which calls kmalloc(), so can't be called holding callback__sem(). + * + * We don't need to task_lock() this reference to tsk->cpuset, + * because tsk is already marked PF_EXITING, so attach_task() won't + * mess with it. **/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - task_lock(tsk); + BUG_ON(!(tsk->flags & PF_EXITING)); + cs = tsk->cpuset; tsk->cpuset = NULL; - task_unlock(tsk); if (notify_on_release(cs)) { char *pathbuf = NULL; - cpuset_down(&cpuset_sem); + down(&manage_sem); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - cpuset_up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -1564,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&callback_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - cpuset_up(&cpuset_sem); + up(&callback_sem); return mask; } @@ -1584,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void) * If the current tasks cpusets mems_allowed changed behind our backs, * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). + * + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Unless exiting, it will acquire + * task_lock(). Also might acquire callback_sem during call to + * refresh_mems(). */ void cpuset_update_current_mems_allowed(void) { - struct cpuset *cs = current->cpuset; + struct cpuset *cs; + int need_to_refresh = 0; + task_lock(current); + cs = current->cpuset; if (!cs) - return; /* task is exiting */ - if (current->cpuset_mems_generation != cs->mems_generation) { - cpuset_down(&cpuset_sem); + goto done; + if (current->cpuset_mems_generation != cs->mems_generation) + need_to_refresh = 1; +done: + task_unlock(current); + if (need_to_refresh) refresh_mems(); - cpuset_up(&cpuset_sem); - } } /** @@ -1630,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) /* * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive - * ancestor to the specified cpuset. Call while holding cpuset_sem. + * ancestor to the specified cpuset. Call holding callback_sem. * If no ancestor is mem_exclusive (an unusual configuration), then * returns the root cpuset. */ @@ -1657,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() + * Scanning up parent cpusets requires callback_sem. The __alloc_pages() * routine only calls here with __GFP_HARDWALL bit _not_ set if * it's a GFP_KERNEL allocation, and all nodes in the current tasks * mems_allowed came up empty on the first pass over the zonelist. * So only GFP_KERNEL allocations, if all nodes in the cpuset are - * short of memory, might require taking the cpuset_sem semaphore. + * short of memory, might require taking the callback_sem semaphore. * * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing @@ -1679,7 +1795,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. **/ -int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) +int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ @@ -1693,15 +1809,18 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return 0; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return 1; + /* Not hardwall and node outside mems_allowed: scan up cpusets */ - cpuset_down(&cpuset_sem); - cs = current->cpuset; - if (!cs) - goto done; /* current task exiting */ - cs = nearest_exclusive_ancestor(cs); + down(&callback_sem); + + task_lock(current); + cs = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + allowed = node_isset(node, cs->mems_allowed); -done: - cpuset_up(&cpuset_sem); + up(&callback_sem); return allowed; } @@ -1714,7 +1833,7 @@ done: * determine if task @p's memory usage might impact the memory * available to the current task. * - * Acquires cpuset_sem - not suitable for calling from a fast path. + * Acquires callback_sem - not suitable for calling from a fast path. **/ int cpuset_excl_nodes_overlap(const struct task_struct *p) @@ -1722,18 +1841,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ int overlap = 0; /* do cpusets overlap? */ - cpuset_down(&cpuset_sem); - cs1 = current->cpuset; - if (!cs1) - goto done; /* current task exiting */ - cs2 = p->cpuset; - if (!cs2) - goto done; /* task p is exiting */ - cs1 = nearest_exclusive_ancestor(cs1); - cs2 = nearest_exclusive_ancestor(cs2); + down(&callback_sem); + + task_lock(current); + if (current->flags & PF_EXITING) { + task_unlock(current); + goto done; + } + cs1 = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + + task_lock((struct task_struct *)p); + if (p->flags & PF_EXITING) { + task_unlock((struct task_struct *)p); + goto done; + } + cs2 = nearest_exclusive_ancestor(p->cpuset); + task_unlock((struct task_struct *)p); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); done: - cpuset_up(&cpuset_sem); + up(&callback_sem); return overlap; } @@ -1742,6 +1870,10 @@ done: * proc_cpuset_show() * - Print tasks cpuset path into seq_file. * - Used for /proc/<pid>/cpuset. + * - No need to task_lock(tsk) on this tsk->cpuset reference, as it + * doesn't really matter if tsk->cpuset changes after we read it, + * and we take manage_sem, keeping attach_task() from changing it + * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *v) @@ -1756,10 +1888,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - cpuset_down(&cpuset_sem); - task_lock(tsk); + down(&manage_sem); cs = tsk->cpuset; - task_unlock(tsk); if (!cs) { retval = -EINVAL; goto out; @@ -1771,7 +1901,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - cpuset_up(&cpuset_sem); + up(&manage_sem); kfree(buf); return retval; } diff --git a/kernel/exit.c b/kernel/exit.c index ee6d8b8abef5..ee515683b92d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -28,6 +28,7 @@ #include <linux/cpuset.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <linux/cn_proc.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -547,7 +548,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) if (p->pdeath_signal) /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, (void *) 0, p); + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); /* Move the child from its dying parent to the new one. */ if (unlikely(traced)) { @@ -591,8 +592,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { - __kill_pg_info(SIGHUP, (void *)1, pgrp); - __kill_pg_info(SIGCONT, (void *)1, pgrp); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } } @@ -727,8 +728,8 @@ static void exit_notify(struct task_struct *tsk) (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { - __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); - __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); } /* Let father know we died @@ -783,10 +784,6 @@ static void exit_notify(struct task_struct *tsk) /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. */ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -839,10 +836,14 @@ fastcall NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - update_mem_hiwater(tsk); + if (tsk->mm) { + update_hiwater_rss(tsk->mm); + update_hiwater_vm(tsk->mm); + } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { del_timer_sync(&tsk->signal->real_timer); + exit_itimers(tsk->signal); acct_process(code); } exit_mm(tsk); @@ -858,18 +859,23 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead && tsk->signal->leader) disassociate_ctty(1); - module_put(tsk->thread_info->exec_domain->module); + module_put(task_thread_info(tsk)->exec_domain->module); if (tsk->binfmt) module_put(tsk->binfmt->module); tsk->exit_code = code; + proc_exit_connector(tsk); exit_notify(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif - BUG_ON(!(current->flags & PF_DEAD)); + /* PF_DEAD causes final put_task_struct after we schedule. */ + preempt_disable(); + BUG_ON(tsk->flags & PF_DEAD); + tsk->flags |= PF_DEAD; + schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -1203,7 +1209,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, exit_code = p->exit_code; if (unlikely(!exit_code) || - unlikely(p->state > TASK_STOPPED)) + unlikely(p->state & TASK_TRACED)) goto bail_ref; return wait_noreap_copyout(p, pid, uid, why, (exit_code << 8) | 0x7f, @@ -1379,6 +1385,15 @@ repeat: switch (p->state) { case TASK_TRACED: + /* + * When we hit the race with PTRACE_ATTACH, + * we will not report this child. But the + * race means it has not yet been moved to + * our ptrace_children list, so we need to + * set the flag here to avoid a spurious ECHILD + * when the race happens with the only child. + */ + flag = 1; if (!my_ptrace_child(p)) continue; /*FALLTHROUGH*/ diff --git a/kernel/fork.c b/kernel/fork.c index 533ce27f4b2c..fb8572a42297 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include <linux/profile.h> #include <linux/rmap.h> #include <linux/acct.h> +#include <linux/cn_proc.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -170,10 +171,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - *ti = *orig->thread_info; *tsk = *orig; tsk->thread_info = ti; - ti->task = tsk; + setup_thread_stack(tsk, orig); /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); @@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } #ifdef CONFIG_MMU -static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) +static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct * mpnt, *tmp, **pprev; + struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(current->mm); + flush_cache_mm(oldmm); + down_write(&mm->mmap_sem); + mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; - set_mm_counter(mm, rss, 0); - set_mm_counter(mm, anon_rss, 0); cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; - for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; - __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, + vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } @@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) } /* - * Link in the new vma and copy the page table entries: - * link in first so that swapoff can see swap entries. - * Note that, exceptionally, here the vma is inserted - * without holding mm->mmap_sem. + * Link in the new vma and copy the page table entries. */ - spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; @@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, current->mm, tmp); - spin_unlock(&mm->page_table_lock); + retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) goto out; } retval = 0; - out: - flush_tlb_mm(current->mm); + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_policy: @@ -323,10 +318,11 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; + set_mm_counter(mm, file_rss, 0); + set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; - mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; @@ -472,13 +468,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; - /* - * There are cases where the PTL is held to ensure no - * new threads start up in user mode using an mm, which - * allows optimizing out ipis; the tlb_gather_mmu code - * is an example. - */ - spin_unlock_wait(&oldmm->page_table_lock); goto good_mm; } @@ -499,7 +488,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (retval) goto free_pt; - mm->hiwater_rss = get_mm_counter(mm,rss); + mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; good_mm: @@ -848,7 +837,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); new_flags |= PF_FORKNOEXEC; if (!(clone_flags & CLONE_PTRACE)) p->ptrace = 0; @@ -928,7 +917,7 @@ static task_t *copy_process(unsigned long clone_flags, if (nr_threads >= max_threads) goto bad_fork_cleanup_count; - if (!try_module_get(p->thread_info->exec_domain->module)) + if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; if (p->binfmt && !try_module_get(p->binfmt->module)) @@ -1135,8 +1124,6 @@ static task_t *copy_process(unsigned long clone_flags, if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); - cpuset_fork(p); - attach_pid(p, PIDTYPE_PID, p->pid); attach_pid(p, PIDTYPE_TGID, p->tgid); if (thread_group_leader(p)) { @@ -1152,6 +1139,8 @@ static task_t *copy_process(unsigned long clone_flags, nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); + cpuset_fork(p); retval = 0; fork_out: @@ -1188,7 +1177,7 @@ bad_fork_cleanup: if (p->binfmt) module_put(p->binfmt->module); bad_fork_cleanup_put_domain: - module_put(p->thread_info->exec_domain->module); + module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: put_group_info(p->group_info); atomic_dec(&p->user->processes); diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b2..5872e3507f35 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -201,23 +201,6 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) * from swap. But that's a lot of code to duplicate here * for a rare case, so we simply fetch the page. */ - - /* - * Do a quick atomic lookup first - this is the fastpath. - */ - spin_lock(¤t->mm->page_table_lock); - page = follow_page(mm, uaddr, 0); - if (likely(page != NULL)) { - key->shared.pgoff = - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - spin_unlock(¤t->mm->page_table_lock); - return 0; - } - spin_unlock(¤t->mm->page_table_lock); - - /* - * Do it the general way. - */ err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); if (err >= 0) { key->shared.pgoff = @@ -367,6 +350,11 @@ retry: if (bh1 != bh2) spin_unlock(&bh2->lock); + if (unlikely(op_ret != -EFAULT)) { + ret = op_ret; + goto out; + } + /* futex_atomic_op_inuser needs to both read and write * *(int __user *)uaddr2, but we can't modify it * non-atomically. Therefore, if get_user below is not diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3ff7b925c387..51df337b37db 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -117,14 +117,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) /* * No locking required for CPU-local interrupts: */ - desc->handler->ack(irq); + if (desc->handler->ack) + desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); desc->handler->end(irq); return 1; } spin_lock(&desc->lock); - desc->handler->ack(irq); + if (desc->handler->ack) + desc->handler->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1cfdb08ddf20..81c49a4d679e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -24,6 +24,7 @@ cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for * * This function waits for any pending IRQ handlers for this interrupt * to complete before returning. If you use this function while @@ -35,6 +36,9 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; + if (irq >= NR_IRQS) + return; + while (desc->status & IRQ_INPROGRESS) cpu_relax(); } @@ -59,6 +63,9 @@ void disable_irq_nosync(unsigned int irq) irq_desc_t *desc = irq_desc + irq; unsigned long flags; + if (irq >= NR_IRQS) + return; + spin_lock_irqsave(&desc->lock, flags); if (!desc->depth++) { desc->status |= IRQ_DISABLED; @@ -85,6 +92,9 @@ void disable_irq(unsigned int irq) { irq_desc_t *desc = irq_desc + irq; + if (irq >= NR_IRQS) + return; + disable_irq_nosync(irq); if (desc->action) synchronize_irq(irq); @@ -107,6 +117,9 @@ void enable_irq(unsigned int irq) irq_desc_t *desc = irq_desc + irq; unsigned long flags; + if (irq >= NR_IRQS) + return; + spin_lock_irqsave(&desc->lock, flags); switch (desc->depth) { case 0: @@ -162,6 +175,9 @@ int setup_irq(unsigned int irq, struct irqaction * new) unsigned long flags; int shared = 0; + if (irq >= NR_IRQS) + return -EINVAL; + if (desc->handler == &no_irq_type) return -ENOSYS; /* diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b57..39277dd6bf90 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -18,6 +18,7 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/proc_fs.h> +#include <linux/sched.h> /* for cond_resched */ #include <linux/mm.h> #include <asm/sections.h> diff --git a/kernel/kexec.c b/kernel/kexec.c index cdd4dcd8fb63..2c95848fbce8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p) static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, @@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image, return 0; } -static struct page *kimage_alloc_pages(unsigned int gfp_mask, - unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; @@ -335,7 +334,7 @@ static struct page *kimage_alloc_pages(unsigned int gfp_mask, if (pages) { unsigned int count, i; pages->mapping = NULL; - pages->private = order; + set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); @@ -348,7 +347,7 @@ static void kimage_free_pages(struct page *page) { unsigned int order, count, i; - order = page->private; + order = page_private(page); count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); @@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image, } static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long destination) { /* diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 179baafcdd96..64ab045c3d9d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -36,7 +36,7 @@ * struct kfifo with kfree(). */ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, - unsigned int __nocast gfp_mask, spinlock_t *lock) + gfp_t gfp_mask, spinlock_t *lock) { struct kfifo *fifo; @@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init); * * The size will be rounded-up to a power of 2. */ -struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) +struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) { unsigned char *buffer; struct kfifo *ret; diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8af..51a892063aaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -131,14 +131,14 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - struct key *old_session; + struct key *new_session, *old_session; int retval; /* Unblock all signals and set the session keyring. */ - key_get(sub_info->ring); + new_session = key_get(sub_info->ring); flush_signals(current); spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(current, sub_info->ring); + old_session = __install_session_keyring(current, new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f3ea492ab44d..5beda378cc75 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -32,9 +32,9 @@ * <prasanna@in.ibm.com> added function-return probes. */ #include <linux/kprobes.h> -#include <linux/spinlock.h> #include <linux/hash.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/moduleloader.h> #include <asm-generic/sections.h> @@ -48,9 +48,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; -unsigned int kprobe_cpu = NR_CPUS; -static DEFINE_SPINLOCK(kprobe_lock); -static struct kprobe *curr_kprobe; +static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */ +DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ +static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -152,50 +152,31 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) } } -/* Locks kprobe: irqs must be disabled */ -void __kprobes lock_kprobes(void) +/* We have preemption disabled.. so it is safe to use __ versions */ +static inline void set_kprobe_instance(struct kprobe *kp) { - unsigned long flags = 0; - - /* Avoiding local interrupts to happen right after we take the kprobe_lock - * and before we get a chance to update kprobe_cpu, this to prevent - * deadlock when we have a kprobe on ISR routine and a kprobe on task - * routine - */ - local_irq_save(flags); - - spin_lock(&kprobe_lock); - kprobe_cpu = smp_processor_id(); - - local_irq_restore(flags); + __get_cpu_var(kprobe_instance) = kp; } -void __kprobes unlock_kprobes(void) +static inline void reset_kprobe_instance(void) { - unsigned long flags = 0; - - /* Avoiding local interrupts to happen right after we update - * kprobe_cpu and before we get a a chance to release kprobe_lock, - * this to prevent deadlock when we have a kprobe on ISR routine and - * a kprobe on task routine - */ - local_irq_save(flags); - - kprobe_cpu = NR_CPUS; - spin_unlock(&kprobe_lock); - - local_irq_restore(flags); + __get_cpu_var(kprobe_instance) = NULL; } -/* You have to be holding the kprobe_lock */ +/* + * This routine is called either: + * - under the kprobe_lock spinlock - during kprobe_[un]register() + * OR + * - with preemption disabled - from arch/xxx/kernel/kprobes.c + */ struct kprobe __kprobes *get_kprobe(void *addr) { struct hlist_head *head; struct hlist_node *node; + struct kprobe *p; head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; - hlist_for_each(node, head) { - struct kprobe *p = hlist_entry(node, struct kprobe, hlist); + hlist_for_each_entry_rcu(p, node, head, hlist) { if (p->addr == addr) return p; } @@ -210,13 +191,13 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; - list_for_each_entry(kp, &p->list, list) { + list_for_each_entry_rcu(kp, &p->list, list) { if (kp->pre_handler) { - curr_kprobe = kp; + set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; } - curr_kprobe = NULL; + reset_kprobe_instance(); } return 0; } @@ -226,11 +207,11 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, { struct kprobe *kp; - list_for_each_entry(kp, &p->list, list) { + list_for_each_entry_rcu(kp, &p->list, list) { if (kp->post_handler) { - curr_kprobe = kp; + set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); - curr_kprobe = NULL; + reset_kprobe_instance(); } } return; @@ -239,12 +220,14 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) { + struct kprobe *cur = __get_cpu_var(kprobe_instance); + /* * if we faulted "during" the execution of a user specified * probe handler, invoke just that probe's fault handler */ - if (curr_kprobe && curr_kprobe->fault_handler) { - if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr)) + if (cur && cur->fault_handler) { + if (cur->fault_handler(cur, regs, trapnr)) return 1; } return 0; @@ -252,17 +235,18 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { - struct kprobe *kp = curr_kprobe; - if (curr_kprobe && kp->break_handler) { - if (kp->break_handler(kp, regs)) { - curr_kprobe = NULL; - return 1; - } + struct kprobe *cur = __get_cpu_var(kprobe_instance); + int ret = 0; + + if (cur && cur->break_handler) { + if (cur->break_handler(cur, regs)) + ret = 1; } - curr_kprobe = NULL; - return 0; + reset_kprobe_instance(); + return ret; } +/* Called with kretprobe_lock held */ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) { struct hlist_node *node; @@ -272,6 +256,7 @@ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) return NULL; } +/* Called with kretprobe_lock held */ static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe *rp) { @@ -282,6 +267,7 @@ static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe return NULL; } +/* Called with kretprobe_lock held */ void __kprobes add_rp_inst(struct kretprobe_instance *ri) { /* @@ -300,6 +286,7 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->used_instances); } +/* Called with kretprobe_lock held */ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) { /* remove rp inst off the rprobe_inst_table */ @@ -333,13 +320,13 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) struct hlist_node *node, *tmp; unsigned long flags = 0; - spin_lock_irqsave(&kprobe_lock, flags); + spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri); } - spin_unlock_irqrestore(&kprobe_lock, flags); + spin_unlock_irqrestore(&kretprobe_lock, flags); } /* @@ -350,9 +337,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); + unsigned long flags = 0; /*TODO: consider to only swap the RA after the last pre_handler fired */ + spin_lock_irqsave(&kretprobe_lock, flags); arch_prepare_kretprobe(rp, regs); + spin_unlock_irqrestore(&kretprobe_lock, flags); return 0; } @@ -383,13 +373,13 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) struct kprobe *kp; if (p->break_handler) { - list_for_each_entry(kp, &old_p->list, list) { + list_for_each_entry_rcu(kp, &old_p->list, list) { if (kp->break_handler) return -EEXIST; } - list_add_tail(&p->list, &old_p->list); + list_add_tail_rcu(&p->list, &old_p->list); } else - list_add(&p->list, &old_p->list); + list_add_rcu(&p->list, &old_p->list); return 0; } @@ -407,18 +397,18 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); - list_add(&p->list, &ap->list); + list_add_rcu(&p->list, &ap->list); INIT_HLIST_NODE(&ap->hlist); - hlist_del(&p->hlist); - hlist_add_head(&ap->hlist, + hlist_del_rcu(&p->hlist); + hlist_add_head_rcu(&ap->hlist, &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); } /* * This is the second or subsequent kprobe at the address - handle * the intricacies - * TODO: Move kcalloc outside the spinlock + * TODO: Move kcalloc outside the spin_lock */ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) @@ -444,7 +434,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) { arch_disarm_kprobe(p); - hlist_del(&p->hlist); + hlist_del_rcu(&p->hlist); spin_unlock_irqrestore(&kprobe_lock, flags); arch_remove_kprobe(p); } @@ -452,11 +442,10 @@ static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) static inline void cleanup_aggr_kprobe(struct kprobe *old_p, struct kprobe *p, unsigned long flags) { - list_del(&p->list); - if (list_empty(&old_p->list)) { + list_del_rcu(&p->list); + if (list_empty(&old_p->list)) cleanup_kprobe(old_p, flags); - kfree(old_p); - } else + else spin_unlock_irqrestore(&kprobe_lock, flags); } @@ -479,9 +468,9 @@ int __kprobes register_kprobe(struct kprobe *p) if ((ret = arch_prepare_kprobe(p)) != 0) goto rm_kprobe; + p->nmissed = 0; spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); - p->nmissed = 0; if (old_p) { ret = register_aggr_kprobe(old_p, p); goto out; @@ -489,7 +478,7 @@ int __kprobes register_kprobe(struct kprobe *p) arch_copy_kprobe(p); INIT_HLIST_NODE(&p->hlist); - hlist_add_head(&p->hlist, + hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); arch_arm_kprobe(p); @@ -510,10 +499,16 @@ void __kprobes unregister_kprobe(struct kprobe *p) spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); if (old_p) { + /* cleanup_*_kprobe() does the spin_unlock_irqrestore */ if (old_p->pre_handler == aggr_pre_handler) cleanup_aggr_kprobe(old_p, p, flags); else cleanup_kprobe(p, flags); + + synchronize_sched(); + if (old_p->pre_handler == aggr_pre_handler && + list_empty(&old_p->list)) + kfree(old_p); } else spin_unlock_irqrestore(&kprobe_lock, flags); } @@ -590,13 +585,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) unregister_kprobe(&rp->kp); /* No race here */ - spin_lock_irqsave(&kprobe_lock, flags); + spin_lock_irqsave(&kretprobe_lock, flags); free_rp_inst(rp); while ((ri = get_used_rp_inst(rp)) != NULL) { ri->rp = NULL; hlist_del(&ri->uflist); } - spin_unlock_irqrestore(&kprobe_lock, flags); + spin_unlock_irqrestore(&kretprobe_lock, flags); } static int __init init_kprobes(void) diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92da..e75950a1092c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind); int kthread_stop(struct task_struct *k) { + return kthread_stop_sem(k, NULL); +} +EXPORT_SYMBOL(kthread_stop); + +int kthread_stop_sem(struct task_struct *k, struct semaphore *s) +{ int ret; down(&kthread_stop_lock); @@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - wake_up_process(k); + if (s) + up(s); + else + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. */ @@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) return ret; } -EXPORT_SYMBOL(kthread_stop); +EXPORT_SYMBOL(kthread_stop_sem); static __init int helper_init(void) { diff --git a/kernel/module.c b/kernel/module.c index ff5c500ab625..2ea929d51ad0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -37,6 +37,7 @@ #include <linux/stop_machine.h> #include <linux/device.h> #include <linux/string.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include <asm/semaphore.h> #include <asm/cacheflush.h> diff --git a/kernel/params.c b/kernel/params.c index fbf173215fd2..47ba69547945 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/err.h> +#include <linux/slab.h> #if 0 #define DEBUGP printk @@ -80,8 +81,6 @@ static char *next_arg(char *args, char **param, char **val) int in_quote = 0, quoted = 0; char *next; - /* Chew any extra spaces */ - while (*args == ' ') args++; if (*args == '"') { args++; in_quote = 1; @@ -121,6 +120,10 @@ static char *next_arg(char *args, char **param, char **val) next = args + i + 1; } else next = args + i; + + /* Chew up trailing spaces. */ + while (*next == ' ') + next++; return next; } @@ -135,6 +138,10 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); + /* Chew leading spaces */ + while (*args == ' ') + args++; + while (*args) { int ret; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ad85d3f0dcc4..cae4f5728997 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -36,7 +36,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp) union cpu_time_count ret; ret.sched = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { ret.cpu = timespec_to_cputime(tp); } @@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ -static inline void bump_cpu_timer(struct k_itimer *timer, +static void bump_cpu_timer(struct k_itimer *timer, union cpu_time_count now) { int i; @@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, for (i = 0; incr < delta - incr; i++) incr = incr << 1; for (; i >= 0; incr >>= 1, i--) { - if (delta <= incr) + if (delta < incr) continue; timer->it.cpu.expires.sched += incr; timer->it_overrun += 1 << i; @@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) incr = cputime_add(incr, incr); for (; i >= 0; incr = cputime_halve(incr), i--) { - if (cputime_le(delta, incr)) + if (cputime_lt(delta, incr)) continue; timer->it.cpu.expires.cpu = cputime_add(timer->it.cpu.expires.cpu, incr); @@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; + int ret = 0; - if (timer->it.cpu.firing) - return TIMER_RETRY; - - if (unlikely(p == NULL)) - return 0; - - if (!list_empty(&timer->it.cpu.entry)) { + if (likely(p != NULL)) { read_lock(&tasklist_lock); if (unlikely(p->signal == NULL)) { /* @@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer) */ BUG_ON(!list_empty(&timer->it.cpu.entry)); } else { - /* - * Take us off the task's timer list. - */ spin_lock(&p->sighand->siglock); - list_del(&timer->it.cpu.entry); + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); spin_unlock(&p->sighand->siglock); } read_unlock(&tasklist_lock); + + if (!ret) + put_task_struct(p); } - put_task_struct(p); - return 0; + return ret; } /* @@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head, cputime_t ptime = cputime_add(utime, stime); list_for_each_entry_safe(timer, next, head, entry) { - timer->task = NULL; list_del_init(&timer->entry); if (cputime_lt(timer->expires.cpu, ptime)) { timer->expires.cpu = cputime_zero; @@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head, ++head; list_for_each_entry_safe(timer, next, head, entry) { - timer->task = NULL; list_del_init(&timer->entry); if (cputime_lt(timer->expires.cpu, utime)) { timer->expires.cpu = cputime_zero; @@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head, ++head; list_for_each_entry_safe(timer, next, head, entry) { - timer->task = NULL; list_del_init(&timer->entry); if (timer->expires.sched < sched_time) { timer->expires.sched = 0; @@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p, struct task_struct *t = p; unsigned int nthreads = atomic_read(&p->signal->live); + if (!nthreads) + return; + switch (clock_idx) { default: BUG(); @@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p, left = cputime_div(cputime_sub(expires.cpu, val.cpu), nthreads); do { - if (!unlikely(t->exit_state)) { + if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(prof_ticks(t), left); if (cputime_eq(t->it_prof_expires, cputime_zero) || @@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p, left = cputime_div(cputime_sub(expires.cpu, val.cpu), nthreads); do { - if (!unlikely(t->exit_state)) { + if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(virt_ticks(t), left); if (cputime_eq(t->it_virt_expires, cputime_zero) || @@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p, nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); do { - if (!unlikely(t->exit_state)) { + if (likely(!(t->flags & PF_EXITING))) { ns = t->sched_time + nsleft; if (t->it_sched_expires == 0 || t->it_sched_expires > ns) { @@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) struct cpu_timer_list *next; unsigned long i; + if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) + return; + head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); @@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { - if (next->expires.sched > nt->expires.sched) { - listpos = &next->entry; + if (next->expires.sched > nt->expires.sched) break; - } + listpos = &next->entry; } } else { list_for_each_entry(next, head, entry) { - if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { - listpos = &next->entry; + if (cputime_gt(next->expires.cpu, nt->expires.cpu)) break; - } + listpos = &next->entry; } } list_add(&nt->entry, listpos); @@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * Disarm any old timer after extracting its expiry time. */ BUG_ON(!irqs_disabled()); + + ret = 0; spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; - list_del_init(&timer->it.cpu.entry); + if (unlikely(timer->it.cpu.firing)) { + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + } else + list_del_init(&timer->it.cpu.entry); spin_unlock(&p->sighand->siglock); /* @@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, } } - if (unlikely(timer->it.cpu.firing)) { + if (unlikely(ret)) { /* * We are colliding with the timer actually firing. * Punt after filling in the timer's old value, and @@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * it as an overrun (thanks to bump_cpu_timer above). */ read_unlock(&tasklist_lock); - timer->it.cpu.firing = -1; - ret = TIMER_RETRY; goto out; } @@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { + int maxfire; struct list_head *timers = tsk->cpu_timers; + maxfire = 20; tsk->it_prof_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { tsk->it_prof_expires = t->expires.cpu; break; } @@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; tsk->it_virt_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { tsk->it_virt_expires = t->expires.cpu; break; } @@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; tsk->it_sched_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (tsk->sched_time < t->expires.sched) { + if (!--maxfire || tsk->sched_time < t->expires.sched) { tsk->it_sched_expires = t->expires.sched; break; } @@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { + int maxfire; struct signal_struct *const sig = tsk->signal; cputime_t utime, stime, ptime, virt_expires, prof_expires; unsigned long long sched_time, sched_expires; @@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk, } while (t != tsk); ptime = cputime_add(utime, stime); + maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(ptime, t->expires.cpu)) { + if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { prof_expires = t->expires.cpu; break; } @@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(utime, t->expires.cpu)) { + if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { virt_expires = t->expires.cpu; break; } @@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (sched_time < t->expires.sched) { + if (!--maxfire || sched_time < t->expires.sched) { sched_expires = t->expires.sched; break; } @@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk, unsigned long long sched_left, sched; const unsigned int nthreads = atomic_read(&sig->live); + if (!nthreads) + return; + prof_left = cputime_sub(prof_expires, utime); prof_left = cputime_sub(prof_left, stime); prof_left = cputime_div(prof_left, nthreads); @@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk, do { t = next_thread(t); - } while (unlikely(t->exit_state)); + } while (unlikely(t->flags & PF_EXITING)); } while (t != tsk); } } @@ -1212,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * The task was cleaned up already, no future firings. */ - return; + goto out; /* * Fetch the current sample and update the timer's expiry time. @@ -1222,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) { clear_dead_task(timer, now); - return; + goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ } else { @@ -1235,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) put_task_struct(p); timer->it.cpu.task = p = NULL; timer->it.cpu.expires.sched = 0; - read_unlock(&tasklist_lock); - return; + goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* * We've noticed that the thread is dead, but @@ -1244,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * drop our task ref. */ clear_dead_task(timer, now); - read_unlock(&tasklist_lock); - return; + goto out_unlock; } cpu_clock_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); @@ -1257,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ arm_timer(timer, now); +out_unlock: read_unlock(&tasklist_lock); + +out: + timer->it_overrun_last = timer->it_overrun; + timer->it_overrun = -1; + ++timer->it_requeue_pending; } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index b7b532acd9fc..5870efb3e200 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -270,7 +270,7 @@ static void tstojiffie(struct timespec *tp, int res, u64 *jiff) long sec = tp->tv_sec; long nsec = tp->tv_nsec + res - 1; - if (nsec > NSEC_PER_SEC) { + if (nsec >= NSEC_PER_SEC) { sec++; nsec -= NSEC_PER_SEC; } @@ -1157,7 +1157,7 @@ retry_delete: } /* - * This is called by __exit_signal, only when there are no more + * This is called by do_exit or de_thread, only when there are no more * references to the shared signal_struct. */ void exit_itimers(struct signal_struct *sig) @@ -1209,13 +1209,9 @@ static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); - tp->tv_sec += wall_to_mono.tv_sec; - tp->tv_nsec += wall_to_mono.tv_nsec; + set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec, + tp->tv_nsec + wall_to_mono.tv_nsec); - if ((tp->tv_nsec - NSEC_PER_SEC) > 0) { - tp->tv_nsec -= NSEC_PER_SEC; - tp->tv_sec++; - } return 0; } @@ -1295,13 +1291,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) return error; } -static void nanosleep_wake_up(unsigned long __data) -{ - struct task_struct *p = (struct task_struct *) __data; - - wake_up_process(p); -} - /* * The standard says that an absolute nanosleep call MUST wake up at * the requested time in spite of clock settings. Here is what we do: @@ -1442,7 +1431,6 @@ static int common_nsleep(clockid_t which_clock, int flags, struct timespec *tsave) { struct timespec t, dum; - struct timer_list new_timer; DECLARE_WAITQUEUE(abs_wqueue, current); u64 rq_time = (u64)0; s64 left; @@ -1451,10 +1439,6 @@ static int common_nsleep(clockid_t which_clock, ¤t_thread_info()->restart_block; abs_wqueue.flags = 0; - init_timer(&new_timer); - new_timer.expires = 0; - new_timer.data = (unsigned long) current; - new_timer.function = nanosleep_wake_up; abs = flags & TIMER_ABSTIME; if (restart_block->fn == clock_nanosleep_restart) { @@ -1490,13 +1474,8 @@ static int common_nsleep(clockid_t which_clock, if (left < (s64)0) break; - new_timer.expires = jiffies + left; - __set_current_state(TASK_INTERRUPTIBLE); - add_timer(&new_timer); - - schedule(); + schedule_timeout_interruptible(left); - del_timer_sync(&new_timer); left = rq_time - get_jiffies_64(); } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 396c7873e804..5ec248cb7f4a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -19,6 +19,15 @@ config PM will issue the hlt instruction if nothing is to be done, thereby sending the processor to sleep and saving power. +config PM_LEGACY + bool "Legacy Power Management API" + depends on PM + default y + ---help--- + Support for pm_register() and friends. + + If unsure, say Y. + config PM_DEBUG bool "Power Management Debug Support" depends on PM @@ -29,7 +38,7 @@ config PM_DEBUG config SOFTWARE_SUSPEND bool "Software Suspend" - depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP)) + depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP) ---help--- Enable the possibility of suspending the machine. It doesn't need APM. diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa13..04be7d0d96a7 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif -obj-y := main.o process.o console.o pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o +obj-y := main.o process.o console.o +obj-$(CONFIG_PM_LEGACY) += pm.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o obj-$(CONFIG_SUSPEND_SMP) += smp.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 2d8bf054d036..027322a564f4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -17,12 +17,12 @@ #include <linux/delay.h> #include <linux/fs.h> #include <linux/mount.h> +#include <linux/pm.h> #include "power.h" extern suspend_disk_method_t pm_disk_mode; -extern struct pm_ops * pm_ops; extern int swsusp_suspend(void); extern int swsusp_write(void); @@ -30,7 +30,6 @@ extern int swsusp_check(void); extern int swsusp_read(void); extern void swsusp_close(void); extern int swsusp_resume(void); -extern int swsusp_free(void); static int noresume = 0; @@ -49,13 +48,11 @@ dev_t swsusp_resume_device; static void power_down(suspend_disk_method_t mode) { - unsigned long flags; int error = 0; - local_irq_save(flags); switch(mode) { case PM_DISK_PLATFORM: - device_shutdown(); + kernel_power_off_prepare(); error = pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: @@ -95,10 +92,7 @@ static void free_some_memory(void) printk("Freeing memory... "); while ((tmp = shrink_all_memory(10000))) { pages += tmp; - printk("\b%c", p[i]); - i++; - if (i > 3) - i = 0; + printk("\b%c", p[i++ % 4]); } printk("\bdone (%li pages freed)\n", pages); } @@ -180,13 +174,12 @@ int pm_suspend_disk(void) goto Done; if (in_suspend) { + device_resume(); pr_debug("PM: writing image.\n"); error = swsusp_write(); if (!error) power_down(pm_disk_mode); else { - /* swsusp_write can not fail in device_resume, - no need to do second device_resume */ swsusp_free(); unprepare_processes(); return error; @@ -254,14 +247,17 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) - goto Cleanup; + if ((error = swsusp_read())) { + swsusp_free(); + goto Thaw; + } pr_debug("PM: Preparing devices for restore.\n"); if ((error = device_suspend(PMSG_FREEZE))) { printk("Some devices failed to suspend\n"); - goto Free; + swsusp_free(); + goto Thaw; } mb(); @@ -270,9 +266,7 @@ static int software_resume(void) swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); device_resume(); - Free: - swsusp_free(); - Cleanup: + Thaw: unprepare_processes(); Done: /* For success case, the suspend path will release the lock */ diff --git a/kernel/power/main.c b/kernel/power/main.c index 22bdc93cc038..d253f3ae2fa5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -24,7 +24,7 @@ DECLARE_MUTEX(pm_sem); -struct pm_ops * pm_ops = NULL; +struct pm_ops *pm_ops; suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; /** @@ -151,6 +151,18 @@ static char *pm_states[PM_SUSPEND_MAX] = { #endif }; +static inline int valid_state(suspend_state_t state) +{ + /* Suspend-to-disk does not really need low-level support. + * It can work with reboot if needed. */ + if (state == PM_SUSPEND_DISK) + return 1; + + if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) + return 0; + return 1; +} + /** * enter_state - Do common work of entering low-power state. @@ -167,6 +179,8 @@ static int enter_state(suspend_state_t state) { int error; + if (!valid_state(state)) + return -ENODEV; if (down_trylock(&pm_sem)) return -EBUSY; @@ -236,8 +250,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) char * s = buf; for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i]) - s += sprintf(s,"%s ",pm_states[i]); + if (pm_states[i] && valid_state(i)) + s += sprintf(s,"%s ", pm_states[i]); } s += sprintf(s,"\n"); return (s - buf); diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 159149321b3c..33c508e857dd 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/pm.h> +#include <linux/pm_legacy.h> #include <linux/interrupt.h> int pm_active; diff --git a/kernel/power/power.h b/kernel/power/power.h index cd6a3493cc0d..6c042b5ee14b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -1,7 +1,7 @@ #include <linux/suspend.h> #include <linux/utsname.h> -/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but +/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but we probably do not take enough locks for switching consoles, etc, so bad things might happen. */ @@ -9,6 +9,9 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) #endif +#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ + - 4 - 3*sizeof(unsigned long) - sizeof(int) \ + - sizeof(void *)) / sizeof(swp_entry_t)) struct swsusp_info { struct new_utsname uts; @@ -18,7 +21,7 @@ struct swsusp_info { unsigned long image_pages; unsigned long pagedir_pages; suspend_pagedir_t * suspend_pagedir; - swp_entry_t pagedir[768]; + swp_entry_t pagedir[MAX_PBES]; } __attribute__((aligned(PAGE_SIZE))); @@ -50,3 +53,20 @@ extern void thaw_processes(void); extern int pm_prepare_console(void); extern void pm_restore_console(void); + + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +extern unsigned int nr_copy_pages; +extern suspend_pagedir_t *pagedir_nosave; +extern suspend_pagedir_t *pagedir_save; + +extern asmlinkage int swsusp_arch_suspend(void); +extern asmlinkage int swsusp_arch_resume(void); + +extern void free_pagedir(struct pbe *pblist); +extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); +extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); +extern void swsusp_free(void); +extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 000000000000..4a6dbcefd378 --- /dev/null +++ b/kernel/power/snapshot.c @@ -0,0 +1,453 @@ +/* + * linux/kernel/power/snapshot.c + * + * This file provide system snapshot/restore functionality. + * + * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * + * This file is released under the GPLv2, and is based on swsusp.c. + * + */ + + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/suspend.h> +#include <linux/smp_lock.h> +#include <linux/delay.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/pm.h> +#include <linux/device.h> +#include <linux/bootmem.h> +#include <linux/syscalls.h> +#include <linux/console.h> +#include <linux/highmem.h> + +#include <asm/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +#include "power.h" + +#ifdef CONFIG_HIGHMEM +struct highmem_page { + char *data; + struct page *page; + struct highmem_page *next; +}; + +static struct highmem_page *highmem_copy; + +static int save_highmem_zone(struct zone *zone) +{ + unsigned long zone_pfn; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + struct page *page; + struct highmem_page *save; + void *kaddr; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + + if (!(pfn%1000)) + printk("."); + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* + * This condition results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. This should be + * corrected eventually when the cases giving rise to this + * are better understood. + */ + if (PageReserved(page)) { + printk("highmem reserved page?!\n"); + continue; + } + BUG_ON(PageNosave(page)); + if (PageNosaveFree(page)) + continue; + save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); + if (!save) + return -ENOMEM; + save->next = highmem_copy; + save->page = page; + save->data = (void *) get_zeroed_page(GFP_ATOMIC); + if (!save->data) { + kfree(save); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(save->data, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + highmem_copy = save; + } + return 0; +} + +int save_highmem(void) +{ + struct zone *zone; + int res = 0; + + pr_debug("swsusp: Saving Highmem\n"); + for_each_zone (zone) { + if (is_highmem(zone)) + res = save_highmem_zone(zone); + if (res) + return res; + } + return 0; +} + +int restore_highmem(void) +{ + printk("swsusp: Restoring Highmem\n"); + while (highmem_copy) { + struct highmem_page *save = highmem_copy; + void *kaddr; + highmem_copy = save->next; + + kaddr = kmap_atomic(save->page, KM_USER0); + memcpy(kaddr, save->data, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + free_page((long) save->data); + kfree(save); + } + return 0; +} +#endif + +static int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +/** + * saveable - Determine whether a page should be cloned or not. + * @pfn: The page + * + * We save a page if it's Reserved, and not in the range of pages + * statically defined as 'unsaveable', or if it isn't reserved, and + * isn't part of a free chunk of pages. + */ + +static int saveable(struct zone *zone, unsigned long *zone_pfn) +{ + unsigned long pfn = *zone_pfn + zone->zone_start_pfn; + struct page *page; + + if (!pfn_valid(pfn)) + return 0; + + page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); + if (PageNosave(page)) + return 0; + if (PageReserved(page) && pfn_is_nosave(pfn)) { + pr_debug("[nosave pfn 0x%lx]", pfn); + return 0; + } + if (PageNosaveFree(page)) + return 0; + + return 1; +} + +static unsigned count_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned int n = 0; + + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + n += saveable(zone, &zone_pfn); + } + return n; +} + +static void copy_data_pages(struct pbe *pblist) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *pbe, *p; + + pbe = pblist; + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + /* This is necessary for swsusp_free() */ + for_each_pb_page (p, pblist) + SetPageNosaveFree(virt_to_page(p)); + for_each_pbe (p, pblist) + SetPageNosaveFree(virt_to_page(p->address)); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + if (saveable(zone, &zone_pfn)) { + struct page *page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + BUG_ON(!pbe); + pbe->orig_address = (unsigned long)page_address(page); + /* copy_page is not usable for copying task structs. */ + memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + pbe = pbe->next; + } + } + } + BUG_ON(pbe); +} + + +/** + * free_pagedir - free pages allocated with alloc_pagedir() + */ + +void free_pagedir(struct pbe *pblist) +{ + struct pbe *pbe; + + while (pblist) { + pbe = (pblist + PB_PAGE_SKIP)->next; + ClearPageNosave(virt_to_page(pblist)); + ClearPageNosaveFree(virt_to_page(pblist)); + free_page((unsigned long)pblist); + pblist = pbe; + } +} + +/** + * fill_pb_page - Create a list of PBEs on a given memory page + */ + +static inline void fill_pb_page(struct pbe *pbpage) +{ + struct pbe *p; + + p = pbpage; + pbpage += PB_PAGE_SKIP; + do + p->next = p + 1; + while (++p < pbpage); +} + +/** + * create_pbe_list - Create a list of PBEs on top of a given chain + * of memory pages allocated with alloc_pagedir() + */ + +void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) +{ + struct pbe *pbpage, *p; + unsigned int num = PBES_PER_PAGE; + + for_each_pb_page (pbpage, pblist) { + if (num >= nr_pages) + break; + + fill_pb_page(pbpage); + num += PBES_PER_PAGE; + } + if (pbpage) { + for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) + p->next = p + 1; + p->next = NULL; + } + pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +/** + * @safe_needed - on resume, for storing the PBE list and the image, + * we can only use memory pages that do not conflict with the pages + * which had been used before suspend. + * + * The unsafe pages are marked with the PG_nosave_free flag + * + * Allocated but unusable (ie eaten) memory pages should be marked + * so that swsusp_free() can release them + */ + +static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) +{ + void *res; + + if (safe_needed) + do { + res = (void *)get_zeroed_page(gfp_mask); + if (res && PageNosaveFree(virt_to_page(res))) + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + } while (res && PageNosaveFree(virt_to_page(res))); + else + res = (void *)get_zeroed_page(gfp_mask); + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + +unsigned long get_safe_page(gfp_t gfp_mask) +{ + return (unsigned long)alloc_image_page(gfp_mask, 1); +} + +/** + * alloc_pagedir - Allocate the page directory. + * + * First, determine exactly how many pages we need and + * allocate them. + * + * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE + * struct pbe elements (pbes) and the last element in the page points + * to the next page. + * + * On each page we set up a list of struct_pbe elements. + */ + +struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) +{ + unsigned int num; + struct pbe *pblist, *pbe; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); + pblist = alloc_image_page(gfp_mask, safe_needed); + /* FIXME: rewrite this ugly loop */ + for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; + pbe = pbe->next, num += PBES_PER_PAGE) { + pbe += PB_PAGE_SKIP; + pbe->next = alloc_image_page(gfp_mask, safe_needed); + } + if (!pbe) { /* get_zeroed_page() failed */ + free_pagedir(pblist); + pblist = NULL; + } + return pblist; +} + +/** + * Free pages we allocated for suspend. Suspend pages are alocated + * before atomic copy, so we need to free them after resume. + */ + +void swsusp_free(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { + struct page *page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + if (PageNosave(page) && PageNosaveFree(page)) { + ClearPageNosave(page); + ClearPageNosaveFree(page); + free_page((long) page_address(page)); + } + } + } +} + + +/** + * enough_free_mem - Make sure we enough free memory to snapshot. + * + * Returns TRUE or FALSE after checking the number of available + * free pages. + */ + +static int enough_free_mem(unsigned int nr_pages) +{ + pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); + return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); +} + +int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) +{ + struct pbe *p; + + for_each_pbe (p, pblist) { + p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed); + if (!p->address) + return -ENOMEM; + } + return 0; +} + +static struct pbe *swsusp_alloc(unsigned int nr_pages) +{ + struct pbe *pblist; + + if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { + printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); + return NULL; + } + create_pbe_list(pblist, nr_pages); + + if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return NULL; + } + + return pblist; +} + +asmlinkage int swsusp_save(void) +{ + unsigned int nr_pages; + + pr_debug("swsusp: critical section: \n"); + + drain_local_pages(); + nr_pages = count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_pages); + + pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", + nr_pages, + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, + PAGES_FOR_IO, nr_free_pages()); + + /* This is needed because of the fixed size of swsusp_info */ + if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) + return -ENOSPC; + + if (!enough_free_mem(nr_pages)) { + printk(KERN_ERR "swsusp: Not enough free memory\n"); + return -ENOMEM; + } + + pagedir_nosave = swsusp_alloc(nr_pages); + if (!pagedir_nosave) + return -ENOMEM; + + /* During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(); + copy_data_pages(pagedir_nosave); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! Except we must write out our image of course. + */ + + nr_copy_pages = nr_pages; + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + return 0; +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index d967e875ee82..c05f46e7348f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1,11 +1,10 @@ /* * linux/kernel/power/swsusp.c * - * This file is to realize architecture-independent - * machine suspend feature using pretty near only high-level routines + * This file provides code to write suspend image to swap and read it back. * * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> * * This file is released under the GPLv2. * @@ -47,11 +46,7 @@ #include <linux/utsname.h> #include <linux/version.h> #include <linux/delay.h> -#include <linux/reboot.h> #include <linux/bitops.h> -#include <linux/vt_kern.h> -#include <linux/kbd_kern.h> -#include <linux/keyboard.h> #include <linux/spinlock.h> #include <linux/genhd.h> #include <linux/kernel.h> @@ -63,10 +58,8 @@ #include <linux/swapops.h> #include <linux/bootmem.h> #include <linux/syscalls.h> -#include <linux/console.h> #include <linux/highmem.h> #include <linux/bio.h> -#include <linux/mount.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -80,36 +73,31 @@ #include "power.h" +#ifdef CONFIG_HIGHMEM +int save_highmem(void); +int restore_highmem(void); +#else +static int save_highmem(void) { return 0; } +static int restore_highmem(void) { return 0; } +#endif + #define CIPHER "aes" #define MAXKEY 32 #define MAXIV 32 -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - -/* Variables to be preserved over suspend */ -static int nr_copy_pages_check; - extern char resume_file[]; /* Local variables that should not be affected by save */ -static unsigned int nr_copy_pages __nosavedata = 0; +unsigned int nr_copy_pages __nosavedata = 0; /* Suspend pagedir is allocated before final copy, therefore it must be freed after resume - Warning: this is evil. There are actually two pagedirs at time of - resume. One is "pagedir_save", which is empty frame allocated at - time of suspend, that must be freed. Second is "pagedir_nosave", - allocated at time of resume, that travels through memory not to - collide with anything. - Warning: this is even more evil than it seems. Pagedirs this file talks about are completely different from page directories used by MMU hardware. */ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; -static suspend_pagedir_t *pagedir_save; #define SWSUSP_SIG "S1SUSPEND" @@ -124,12 +112,6 @@ static struct swsusp_header { static struct swsusp_info swsusp_info; /* - * XXX: We try to keep some more pages free so that I/O operations succeed - * without paging. Might this be more? - */ -#define PAGES_FOR_IO 512 - -/* * Saving part... */ @@ -141,8 +123,8 @@ static struct swsusp_info swsusp_info; static unsigned short swapfile_used[MAX_SWAPFILES]; static unsigned short root_swap; -static int write_page(unsigned long addr, swp_entry_t * loc); -static int bio_read_page(pgoff_t page_off, void * page); +static int write_page(unsigned long addr, swp_entry_t *loc); +static int bio_read_page(pgoff_t page_off, void *page); static u8 key_iv[MAXKEY+MAXIV]; @@ -363,7 +345,7 @@ static void lock_swapdevices(void) } /** - * write_swap_page - Write one page to a fresh swap location. + * write_page - Write one page to a fresh swap location. * @addr: Address we're writing. * @loc: Place to store the entry we used. * @@ -374,7 +356,7 @@ static void lock_swapdevices(void) * This is a partial improvement, since we will at least return other * errors, though we need to eventually fix the damn code. */ -static int write_page(unsigned long addr, swp_entry_t * loc) +static int write_page(unsigned long addr, swp_entry_t *loc) { swp_entry_t entry; int error = 0; @@ -402,15 +384,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc) static void data_free(void) { swp_entry_t entry; - int i; + struct pbe *p; - for (i = 0; i < nr_copy_pages; i++) { - entry = (pagedir_nosave + i)->swap_address; + for_each_pbe (p, pagedir_nosave) { + entry = p->swap_address; if (entry.val) swap_free(entry); else break; - (pagedir_nosave + i)->swap_address = (swp_entry_t){0}; } } @@ -512,8 +493,8 @@ static void free_pagedir_entries(void) static int write_pagedir(void) { int error = 0; - unsigned n = 0; - struct pbe * pbe; + unsigned int n = 0; + struct pbe *pbe; printk( "Writing pagedir..."); for_each_pb_page (pbe, pagedir_nosave) { @@ -527,6 +508,26 @@ static int write_pagedir(void) } /** + * enough_swap - Make sure we have enough swap to save the image. + * + * Returns TRUE or FALSE after checking the total amount of swap + * space avaiable. + * + * FIXME: si_swapinfo(&i) returns all swap devices information. + * We should only consider resume_device. + */ + +static int enough_swap(unsigned int nr_pages) +{ + struct sysinfo i; + + si_swapinfo(&i); + pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); + return i.freeswap > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); +} + +/** * write_suspend_image - Write entire image and metadata. * */ @@ -534,6 +535,11 @@ static int write_suspend_image(void) { int error; + if (!enough_swap(nr_copy_pages)) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } + init_header(); if ((error = data_write())) goto FreeData; @@ -553,433 +559,6 @@ static int write_suspend_image(void) goto Done; } - -#ifdef CONFIG_HIGHMEM -struct highmem_page { - char *data; - struct page *page; - struct highmem_page *next; -}; - -static struct highmem_page *highmem_copy; - -static int save_highmem_zone(struct zone *zone) -{ - unsigned long zone_pfn; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - struct page *page; - struct highmem_page *save; - void *kaddr; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - - if (!(pfn%1000)) - printk("."); - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - /* - * This condition results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. This should be - * corrected eventually when the cases giving rise to this - * are better understood. - */ - if (PageReserved(page)) { - printk("highmem reserved page?!\n"); - continue; - } - BUG_ON(PageNosave(page)); - if (PageNosaveFree(page)) - continue; - save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); - if (!save) - return -ENOMEM; - save->next = highmem_copy; - save->page = page; - save->data = (void *) get_zeroed_page(GFP_ATOMIC); - if (!save->data) { - kfree(save); - return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); - memcpy(save->data, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - highmem_copy = save; - } - return 0; -} -#endif /* CONFIG_HIGHMEM */ - - -static int save_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int res = 0; - - pr_debug("swsusp: Saving Highmem\n"); - for_each_zone (zone) { - if (is_highmem(zone)) - res = save_highmem_zone(zone); - if (res) - return res; - } -#endif - return 0; -} - -static int restore_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - printk("swsusp: Restoring Highmem\n"); - while (highmem_copy) { - struct highmem_page *save = highmem_copy; - void *kaddr; - highmem_copy = save->next; - - kaddr = kmap_atomic(save->page, KM_USER0); - memcpy(kaddr, save->data, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - free_page((long) save->data); - kfree(save); - } -#endif - return 0; -} - - -static int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -/** - * saveable - Determine whether a page should be cloned or not. - * @pfn: The page - * - * We save a page if it's Reserved, and not in the range of pages - * statically defined as 'unsaveable', or if it isn't reserved, and - * isn't part of a free chunk of pages. - */ - -static int saveable(struct zone * zone, unsigned long * zone_pfn) -{ - unsigned long pfn = *zone_pfn + zone->zone_start_pfn; - struct page * page; - - if (!pfn_valid(pfn)) - return 0; - - page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); - if (PageNosave(page)) - return 0; - if (PageReserved(page) && pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); - return 0; - } - if (PageNosaveFree(page)) - return 0; - - return 1; -} - -static void count_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - - nr_copy_pages = 0; - - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - nr_copy_pages += saveable(zone, &zone_pfn); - } -} - - -static void copy_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - struct pbe * pbe = pagedir_nosave; - - pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - if (saveable(zone, &zone_pfn)) { - struct page * page; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); - BUG_ON(!pbe); - pbe->orig_address = (long) page_address(page); - /* copy_page is not usable for copying task structs. */ - memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); - pbe = pbe->next; - } - } - } - BUG_ON(pbe); -} - - -/** - * calc_nr - Determine the number of pages needed for a pbe list. - */ - -static int calc_nr(int nr_copy) -{ - return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); -} - -/** - * free_pagedir - free pages allocated with alloc_pagedir() - */ - -static inline void free_pagedir(struct pbe *pblist) -{ - struct pbe *pbe; - - while (pblist) { - pbe = (pblist + PB_PAGE_SKIP)->next; - free_page((unsigned long)pblist); - pblist = pbe; - } -} - -/** - * fill_pb_page - Create a list of PBEs on a given memory page - */ - -static inline void fill_pb_page(struct pbe *pbpage) -{ - struct pbe *p; - - p = pbpage; - pbpage += PB_PAGE_SKIP; - do - p->next = p + 1; - while (++p < pbpage); -} - -/** - * create_pbe_list - Create a list of PBEs on top of a given chain - * of memory pages allocated with alloc_pagedir() - */ - -static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) -{ - struct pbe *pbpage, *p; - unsigned num = PBES_PER_PAGE; - - for_each_pb_page (pbpage, pblist) { - if (num >= nr_pages) - break; - - fill_pb_page(pbpage); - num += PBES_PER_PAGE; - } - if (pbpage) { - for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) - p->next = p + 1; - p->next = NULL; - } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); -} - -/** - * alloc_pagedir - Allocate the page directory. - * - * First, determine exactly how many pages we need and - * allocate them. - * - * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE - * struct pbe elements (pbes) and the last element in the page points - * to the next page. - * - * On each page we set up a list of struct_pbe elements. - */ - -static struct pbe * alloc_pagedir(unsigned nr_pages) -{ - unsigned num; - struct pbe *pblist, *pbe; - - if (!nr_pages) - return NULL; - - pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; - pbe = pbe->next, num += PBES_PER_PAGE) { - pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - } - if (!pbe) { /* get_zeroed_page() failed */ - free_pagedir(pblist); - pblist = NULL; - } - return pblist; -} - -/** - * free_image_pages - Free pages allocated for snapshot - */ - -static void free_image_pages(void) -{ - struct pbe * p; - - for_each_pbe (p, pagedir_save) { - if (p->address) { - ClearPageNosave(virt_to_page(p->address)); - free_page(p->address); - p->address = 0; - } - } -} - -/** - * alloc_image_pages - Allocate pages for the snapshot. - */ - -static int alloc_image_pages(void) -{ - struct pbe * p; - - for_each_pbe (p, pagedir_save) { - p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - if (!p->address) - return -ENOMEM; - SetPageNosave(virt_to_page(p->address)); - } - return 0; -} - -void swsusp_free(void) -{ - BUG_ON(PageNosave(virt_to_page(pagedir_save))); - BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); - free_image_pages(); - free_pagedir(pagedir_save); -} - - -/** - * enough_free_mem - Make sure we enough free memory to snapshot. - * - * Returns TRUE or FALSE after checking the number of available - * free pages. - */ - -static int enough_free_mem(void) -{ - if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough free pages: Have %d\n", - nr_free_pages()); - return 0; - } - return 1; -} - - -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable. - * - * FIXME: si_swapinfo(&i) returns all swap devices information. - * We should only consider resume_device. - */ - -static int enough_swap(void) -{ - struct sysinfo i; - - si_swapinfo(&i); - if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); - return 0; - } - return 1; -} - -static int swsusp_alloc(void) -{ - int error; - - pagedir_nosave = NULL; - nr_copy_pages = calc_nr(nr_copy_pages); - - pr_debug("suspend: (pages needed: %d + %d free: %d)\n", - nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); - - if (!enough_free_mem()) - return -ENOMEM; - - if (!enough_swap()) - return -ENOSPC; - - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { - printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return -ENOMEM; - } - create_pbe_list(pagedir_save, nr_copy_pages); - pagedir_nosave = pagedir_save; - if ((error = alloc_image_pages())) { - printk(KERN_ERR "suspend: Allocating image pages failed.\n"); - swsusp_free(); - return error; - } - - nr_copy_pages_check = nr_copy_pages; - return 0; -} - -static int suspend_prepare_image(void) -{ - int error; - - pr_debug("swsusp: critical section: \n"); - if (save_highmem()) { - printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); - restore_highmem(); - return -ENOMEM; - } - - drain_local_pages(); - count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_copy_pages); - - error = swsusp_alloc(); - if (error) - return error; - - /* During allocating of suspend pagedir, new cold pages may appear. - * Kill them. - */ - drain_local_pages(); - copy_data_pages(); - - /* - * End of critical section. From now on, we can write to memory, - * but we should not touch disk. This specially means we must _not_ - * touch swap space! Except we must write out our image of course. - */ - - printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); - return 0; -} - - /* It is important _NOT_ to umount filesystems at this point. We want * them synced (in case something goes wrong) but we DO not want to mark * filesystem clean: it is not. (And it does not matter, if we resume @@ -988,28 +567,24 @@ static int suspend_prepare_image(void) int swsusp_write(void) { int error; - device_resume(); + + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); + return error; + } lock_swapdevices(); error = write_suspend_image(); /* This will unlock ignored swap devices since writing is finished */ lock_swapdevices(); return error; - } -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - - -asmlinkage int swsusp_save(void) -{ - return suspend_prepare_image(); -} int swsusp_suspend(void) { int error; + if ((error = arch_prepare_suspend())) return error; local_irq_disable(); @@ -1021,15 +596,12 @@ int swsusp_suspend(void) */ if ((error = device_power_down(PMSG_FREEZE))) { printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); - local_irq_enable(); - return error; + goto Enable_irqs; } - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); - device_power_up(); - local_irq_enable(); - return error; + if ((error = save_highmem())) { + printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); + goto Restore_highmem; } save_processor_state(); @@ -1037,9 +609,10 @@ int swsusp_suspend(void) printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); - BUG_ON (nr_copy_pages_check != nr_copy_pages); +Restore_highmem: restore_highmem(); device_power_up(); +Enable_irqs: local_irq_enable(); return error; } @@ -1057,6 +630,11 @@ int swsusp_resume(void) * execution continues at place where swsusp_arch_suspend was called */ BUG_ON(!error); + /* The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures + */ + swsusp_free(); restore_processor_state(); restore_highmem(); touch_softlockup_watchdog(); @@ -1066,158 +644,43 @@ int swsusp_resume(void) } /** - * On resume, for storing the PBE list and the image, - * we can only use memory pages that do not conflict with the pages - * which had been used before suspend. - * - * We don't know which pages are usable until we allocate them. - * - * Allocated but unusable (ie eaten) memory pages are linked together - * to create a list, so that we can free them easily - * - * We could have used a type other than (void *) - * for this purpose, but ... + * mark_unsafe_pages - mark the pages that cannot be used for storing + * the image during resume, because they conflict with the pages that + * had been used before suspend */ -static void **eaten_memory = NULL; - -static inline void eat_page(void *page) -{ - void **c; - c = eaten_memory; - eaten_memory = page; - *eaten_memory = c; -} - -static unsigned long get_usable_page(unsigned gfp_mask) -{ - unsigned long m; - - m = get_zeroed_page(gfp_mask); - while (!PageNosaveFree(virt_to_page(m))) { - eat_page((void *)m); - m = get_zeroed_page(gfp_mask); - if (!m) - break; - } - return m; -} - -static void free_eaten_memory(void) -{ - unsigned long m; - void **c; - int i = 0; - - c = eaten_memory; - while (c) { - m = (unsigned long)c; - c = *c; - free_page(m); - i++; - } - eaten_memory = NULL; - pr_debug("swsusp: %d unused pages freed\n", i); -} - -/** - * check_pagedir - We ensure here that pages that the PBEs point to - * won't collide with pages where we're going to restore from the loaded - * pages later - */ - -static int check_pagedir(struct pbe *pblist) -{ - struct pbe *p; - - /* This is necessary, so that we can free allocated pages - * in case of failure - */ - for_each_pbe (p, pblist) - p->address = 0UL; - - for_each_pbe (p, pblist) { - p->address = get_usable_page(GFP_ATOMIC); - if (!p->address) - return -ENOMEM; - } - return 0; -} - -/** - * swsusp_pagedir_relocate - It is possible, that some memory pages - * occupied by the list of PBEs collide with pages where we're going to - * restore from the loaded pages later. We relocate them here. - */ - -static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) +static void mark_unsafe_pages(struct pbe *pblist) { struct zone *zone; unsigned long zone_pfn; - struct pbe *pbpage, *tail, *p; - void *m; - int rel = 0, error = 0; + struct pbe *p; if (!pblist) /* a sanity check */ - return NULL; - - pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", - swsusp_info.pagedir_pages); - - /* Set page flags */ + return; + /* Clear page flags */ for_each_zone (zone) { - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - SetPageNosaveFree(pfn_to_page(zone_pfn + + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) + ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); } - /* Clear orig addresses */ - + /* Mark orig addresses */ for_each_pbe (p, pblist) - ClearPageNosaveFree(virt_to_page(p->orig_address)); - - tail = pblist + PB_PAGE_SKIP; - - /* Relocate colliding pages */ - - for_each_pb_page (pbpage, pblist) { - if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { - m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); - if (!m) { - error = -ENOMEM; - break; - } - memcpy(m, (void *)pbpage, PAGE_SIZE); - if (pbpage == pblist) - pblist = (struct pbe *)m; - else - tail->next = (struct pbe *)m; - - eat_page((void *)pbpage); - pbpage = (struct pbe *)m; - - /* We have to link the PBEs again */ + SetPageNosaveFree(virt_to_page(p->orig_address)); - for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) - if (p->next) /* needed to save the end */ - p->next = p + 1; - - rel++; - } - tail = pbpage + PB_PAGE_SKIP; - } +} - if (error) { - printk("\nswsusp: Out of memory\n\n"); - free_pagedir(pblist); - free_eaten_memory(); - pblist = NULL; +static void copy_page_backup_list(struct pbe *dst, struct pbe *src) +{ + /* We assume both lists contain the same number of elements */ + while (src) { + dst->orig_address = src->orig_address; + dst->swap_address = src->swap_address; + dst = dst->next; + src = src->next; } - else - printk("swsusp: Relocated %d pages\n", rel); - - return pblist; } /* @@ -1231,7 +694,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) static atomic_t io_done = ATOMIC_INIT(0); -static int end_io(struct bio * bio, unsigned int num, int err) +static int end_io(struct bio *bio, unsigned int num, int err) { if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) panic("I/O error reading memory image"); @@ -1239,7 +702,7 @@ static int end_io(struct bio * bio, unsigned int num, int err) return 0; } -static struct block_device * resume_bdev; +static struct block_device *resume_bdev; /** * submit - submit BIO request. @@ -1252,10 +715,10 @@ static struct block_device * resume_bdev; * Then submit it and wait. */ -static int submit(int rw, pgoff_t page_off, void * page) +static int submit(int rw, pgoff_t page_off, void *page) { int error = 0; - struct bio * bio; + struct bio *bio; bio = bio_alloc(GFP_ATOMIC, 1); if (!bio) @@ -1284,12 +747,12 @@ static int submit(int rw, pgoff_t page_off, void * page) return error; } -static int bio_read_page(pgoff_t page_off, void * page) +static int bio_read_page(pgoff_t page_off, void *page) { return submit(READ, page_off, page); } -static int bio_write_page(pgoff_t page_off, void * page) +static int bio_write_page(pgoff_t page_off, void *page) { return submit(WRITE, page_off, page); } @@ -1299,7 +762,7 @@ static int bio_write_page(pgoff_t page_off, void * page) * I really don't think that it's foolproof but more than nothing.. */ -static const char * sanity_check(void) +static const char *sanity_check(void) { dump_info(); if (swsusp_info.version_code != LINUX_VERSION_CODE) @@ -1325,7 +788,7 @@ static const char * sanity_check(void) static int check_header(void) { - const char * reason = NULL; + const char *reason = NULL; int error; if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) @@ -1356,7 +819,7 @@ static int check_sig(void) * Reset swap signature now. */ error = bio_write_page(0, &swsusp_header); - } else { + } else { return -EINVAL; } if (!error) @@ -1373,7 +836,7 @@ static int check_sig(void) static int data_read(struct pbe *pblist) { - struct pbe * p; + struct pbe *p; int error = 0; int i = 0; int mod = swsusp_info.image_pages / 100; @@ -1411,7 +874,7 @@ static int data_read(struct pbe *pblist) static int read_pagedir(struct pbe *pblist) { struct pbe *pbpage, *p; - unsigned i = 0; + unsigned int i = 0; int error; if (!pblist) @@ -1433,10 +896,8 @@ static int read_pagedir(struct pbe *pblist) break; } - if (error) - free_page((unsigned long)pblist); - - BUG_ON(i != swsusp_info.pagedir_pages); + if (!error) + BUG_ON(i != swsusp_info.pagedir_pages); return error; } @@ -1460,32 +921,29 @@ static int read_suspend_image(void) int error = 0; struct pbe *p; - if (!(p = alloc_pagedir(nr_copy_pages))) + if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) return -ENOMEM; if ((error = read_pagedir(p))) return error; - create_pbe_list(p, nr_copy_pages); - - if (!(pagedir_nosave = swsusp_pagedir_relocate(p))) + mark_unsafe_pages(p); + pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + if (pagedir_nosave) { + create_pbe_list(pagedir_nosave, nr_copy_pages); + copy_page_backup_list(pagedir_nosave, p); + } + free_pagedir(p); + if (!pagedir_nosave) return -ENOMEM; /* Allocate memory for the image and read the data from swap */ - error = check_pagedir(pagedir_nosave); - free_eaten_memory(); + error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); + if (!error) error = data_read(pagedir_nosave); - if (error) { /* We fail cleanly */ - for_each_pbe (p, pagedir_nosave) - if (p->address) { - free_page(p->address); - p->address = 0UL; - } - free_pagedir(pagedir_nosave); - } return error; } diff --git a/kernel/printk.c b/kernel/printk.c index 4b8f0f9230a4..5287be83e3e7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -10,7 +10,7 @@ * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn. - * Fixed SMP synchronization, 08/08/99, Manfred Spraul + * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfreds@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton <andrewm@uow.edu.au> @@ -148,7 +148,7 @@ static int __init console_setup(char *str) if (!strcmp(str, "ttyb")) strcpy(name, "ttyS1"); #endif - for(s = name; *s; s++) + for (s = name; *s; s++) if ((*s >= '0' && *s <= '9') || *s == ',') break; idx = simple_strtoul(s, NULL, 10); @@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) size = roundup_pow_of_two(size); if (size > log_buf_len) { unsigned long start, dest_idx, offset; - char * new_log_buf; + char *new_log_buf; new_log_buf = alloc_bootmem(size); if (!new_log_buf) { - printk("log_buf_len: allocation failed\n"); + printk(KERN_WARNING "log_buf_len: allocation failed\n"); goto out; } @@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) log_end -= offset; spin_unlock_irqrestore(&logbuf_lock, flags); - printk("log_buf_len: %d\n", log_buf_len); + printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); } out: - return 1; } @@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user * buf, int len) +int do_syslog(int type, char __user *buf, int len) { unsigned long i, j, limit, count; int do_clear = 0; @@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len) error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, (log_start - log_end)); + error = wait_event_interruptible(log_wait, + (log_start - log_end)); if (error) goto out; i = 0; @@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) error = i; break; case 4: /* Read/clear last kernel messages */ - do_clear = 1; + do_clear = 1; /* FALL THRU */ case 3: /* Read last kernel messages */ error = -EINVAL; @@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) limit = log_end; /* * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages + * printk() could overwrite the messages * we try to copy to user space. Therefore * the messages are copied in reverse. <manfreds> */ - for(i = 0; i < count && !error; i++) { + for (i = 0; i < count && !error; i++) { j = limit-1-i; if (j + log_buf_len < log_end) break; @@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) if (error) break; error = i; - if(i != count) { + if (i != count) { int offset = count-error; /* buffer overflow during copy, correct user buffer. */ - for(i=0;i<error;i++) { + for (i = 0; i < error; i++) { if (__get_user(c,&buf[i+offset]) || __put_user(c,&buf[i])) { error = -EFAULT; @@ -351,7 +351,7 @@ out: return error; } -asmlinkage long sys_syslog(int type, char __user * buf, int len) +asmlinkage long sys_syslog(int type, char __user *buf, int len) { return do_syslog(type, buf, len); } @@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end) cur_index = start; start_print = start; while (cur_index != end) { - if ( msg_level < 0 && - ((end - cur_index) > 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') - { + if (msg_level < 0 && ((end - cur_index) > 2) && + LOG_BUF(cur_index + 0) == '<' && + LOG_BUF(cur_index + 1) >= '0' && + LOG_BUF(cur_index + 1) <= '7' && + LOG_BUF(cur_index + 2) == '>') { msg_level = LOG_BUF(cur_index + 1) - '0'; cur_index += 3; start_print = cur_index; } while (cur_index != end) { char c = LOG_BUF(cur_index); - cur_index++; + cur_index++; if (c == '\n') { if (msg_level < 0) { /* @@ -461,7 +459,7 @@ static void zap_locks(void) static unsigned long oops_timestamp; if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30*HZ)) + !time_after(jiffies, oops_timestamp + 30 * HZ)) return; oops_timestamp = jiffies; @@ -493,9 +491,12 @@ __attribute__((weak)) unsigned long long printk_clock(void) return sched_clock(); } -/* +/** + * printk - print a kernel message + * @fmt: format string + * * This is printk. It can be called from any context. We want it to work. - * + * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output * into the log buffer and return. The current holder of the console_sem will @@ -505,6 +506,9 @@ __attribute__((weak)) unsigned long long printk_clock(void) * One effect of this deferred printing is that code which calls printk() and * then changes console_loglevel may break. This is because console_loglevel * is inspected when the actual printing occurs. + * + * See also: + * printf(3) */ asmlinkage int printk(const char *fmt, ...) @@ -639,18 +643,27 @@ EXPORT_SYMBOL(vprintk); #else -asmlinkage long sys_syslog(int type, char __user * buf, int len) +asmlinkage long sys_syslog(int type, char __user *buf, int len) { return 0; } -int do_syslog(int type, char __user * buf, int len) { return 0; } -static void call_console_drivers(unsigned long start, unsigned long end) {} +int do_syslog(int type, char __user *buf, int len) +{ + return 0; +} + +static void call_console_drivers(unsigned long start, unsigned long end) +{ +} #endif /** * add_preferred_console - add a device to the list of preferred consoles. + * @name: device name + * @idx: device index + * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup @@ -760,7 +773,8 @@ void release_console_sem(void) } EXPORT_SYMBOL(release_console_sem); -/** console_conditional_schedule - yield the CPU if required +/** + * console_conditional_schedule - yield the CPU if required * * If the console code is currently allowed to sleep, and * if this CPU should yield the CPU to another task, do @@ -802,7 +816,6 @@ void console_unblank(void) c->unblank(); release_console_sem(); } -EXPORT_SYMBOL(console_unblank); /* * Return the console tty driver structure and its associated index @@ -851,9 +864,9 @@ EXPORT_SYMBOL(console_start); * print any messages that were printed by the kernel before the * console driver was initialized. */ -void register_console(struct console * console) +void register_console(struct console *console) { - int i; + int i; unsigned long flags; if (preferred_console < 0) @@ -878,7 +891,8 @@ void register_console(struct console * console) * See if this console matches one we selected on * the command line. */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; + i++) { if (strcmp(console_cmdline[i].name, console->name) != 0) continue; if (console->index >= 0 && @@ -933,26 +947,26 @@ void register_console(struct console * console) } EXPORT_SYMBOL(register_console); -int unregister_console(struct console * console) +int unregister_console(struct console *console) { - struct console *a,*b; + struct console *a, *b; int res = 1; acquire_console_sem(); if (console_drivers == console) { console_drivers=console->next; res = 0; - } else { + } else if (console_drivers) { for (a=console_drivers->next, b=console_drivers ; a; b=a, a=b->next) { if (a == console) { b->next = a->next; res = 0; break; - } + } } } - + /* If last console is removed, we re-enable picking the first * one that gets registered. Without that, pmac early boot console * would prevent fbcon from taking over. @@ -972,6 +986,8 @@ EXPORT_SYMBOL(unregister_console); /** * tty_write_message - write a message to a certain tty, not just the console. + * @tty: the destination tty_struct + * @msg: the message to write * * This is used for messages that need to be redirected to a specific tty. * We don't put it into the syslog queue right now maybe in the future if @@ -994,7 +1010,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned long toks = 10*5*HZ; + static unsigned long toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; unsigned long flags; @@ -1007,6 +1023,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) toks = ratelimit_burst * ratelimit_jiffies; if (toks >= ratelimit_jiffies) { int lost = missed; + missed = 0; toks -= ratelimit_jiffies; spin_unlock_irqrestore(&ratelimit_lock, flags); @@ -1021,7 +1038,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) EXPORT_SYMBOL(__printk_ratelimit); /* minimum time in jiffies between messages */ -int printk_ratelimit_jiffies = 5*HZ; +int printk_ratelimit_jiffies = 5 * HZ; /* number of messages we send before ratelimiting */ int printk_ratelimit_burst = 10; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 019e04ec065a..656476eedb1b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) signal_wake_up(child, 1); } } + if (child->signal->flags & SIGNAL_GROUP_EXIT) { + sigaddset(&child->pending.signal, SIGKILL); + signal_wake_up(child, 1); + } spin_unlock(&child->sighand->siglock); } @@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child) SET_LINKS(child); } - if (child->state == TASK_TRACED) - ptrace_untrace(child); + ptrace_untrace(child); } /* @@ -152,7 +155,7 @@ int ptrace_attach(struct task_struct *task) retval = -EPERM; if (task->pid <= 1) goto bad; - if (task == current) + if (task->tgid == current->tgid) goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) @@ -238,7 +241,8 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - set_page_dirty_lock(page); + if (!PageCompound(page)) + set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); @@ -403,3 +407,85 @@ int ptrace_request(struct task_struct *child, long request, return ret; } + +#ifndef __ARCH_SYS_PTRACE +static int ptrace_get_task_struct(long request, long pid, + struct task_struct **childp) +{ + struct task_struct *child; + int ret; + + /* + * Callers use child == NULL as an indication to exit early even + * when the return value is 0, so make sure it is non-NULL here. + */ + *childp = NULL; + + if (request == PTRACE_TRACEME) { + /* + * Are we already being traced? + */ + if (current->ptrace & PT_PTRACED) + return -EPERM; + ret = security_ptrace(current->parent, current); + if (ret) + return -EPERM; + /* + * Set the ptrace bit in the process ptrace flags. + */ + current->ptrace |= PT_PTRACED; + return 0; + } + + /* + * You may not mess with init + */ + if (pid == 1) + return -EPERM; + + ret = -ESRCH; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); + if (!child) + return -ESRCH; + + *childp = child; + return 0; +} + +asmlinkage long sys_ptrace(long request, long pid, long addr, long data) +{ + struct task_struct *child; + long ret; + + /* + * This lock_kernel fixes a subtle race with suid exec + */ + lock_kernel(); + ret = ptrace_get_task_struct(request, pid, &child); + if (!child) + goto out; + + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + goto out_put_task_struct; + } + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out_put_task_struct; + + ret = arch_ptrace(child, request, addr, data); + if (ret < 0) + goto out_put_task_struct; + + out_put_task_struct: + put_task_struct(child); + out: + unlock_kernel(); + return ret; +} +#endif /* __ARCH_SYS_PTRACE */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index bef3b6901b76..c4d159a21e04 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -71,7 +71,7 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -static int maxbatch = 10; +static int maxbatch = 10000; #ifndef __HAVE_ARCH_CMPXCHG /* @@ -109,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head, rdp = &__get_cpu_var(rcu_data); *rdp->nxttail = head; rdp->nxttail = &head->next; + + if (unlikely(++rdp->count > 10000)) + set_need_resched(); + local_irq_restore(flags); } @@ -140,10 +144,25 @@ void fastcall call_rcu_bh(struct rcu_head *head, rdp = &__get_cpu_var(rcu_bh_data); *rdp->nxttail = head; rdp->nxttail = &head->next; + rdp->count++; +/* + * Should we directly call rcu_do_batch() here ? + * if (unlikely(rdp->count > 10000)) + * rcu_do_batch(rdp); + */ local_irq_restore(flags); } /* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} + +/* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. */ @@ -157,6 +176,7 @@ static void rcu_do_batch(struct rcu_data *rdp) next = rdp->donelist = list->next; list->func(list); list = next; + rdp->count--; if (++count >= maxbatch) break; } @@ -490,6 +510,7 @@ void synchronize_kernel(void) } module_param(maxbatch, int, 0); +EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 000000000000..88c28d476550 --- /dev/null +++ b/kernel/rcutorture.c @@ -0,0 +1,514 @@ +/* + * Read-Copy Update /proc-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * + * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * + * See also: Documentation/RCU/torture.txt + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <asm/atomic.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/rcuref.h> +#include <linux/cpu.h> +#include <linux/random.h> +#include <linux/delay.h> +#include <linux/byteorder/swabb.h> +#include <linux/stat.h> + +MODULE_LICENSE("GPL"); + +static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ +static int stat_interval = 0; /* Interval between stats, in seconds. */ + /* Defaults to "only at end of test". */ +static int verbose = 0; /* Print more debug info. */ + +MODULE_PARM(nreaders, "i"); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +MODULE_PARM(stat_interval, "i"); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +MODULE_PARM(verbose, "i"); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +#define TORTURE_FLAG "rcutorture: " +#define PRINTK_STRING(s) \ + do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) + +static char printk_buf[4096]; + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; + int rtort_mbtest; +}; + +static int fullstop = 0; /* stop generating callbacks at test end. */ +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture *rcu_torture_current = NULL; +static long rcu_torture_current_version = 0; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = + { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = + { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +atomic_t n_rcu_torture_alloc; +atomic_t n_rcu_torture_alloc_fail; +atomic_t n_rcu_torture_free; +atomic_t n_rcu_torture_mberror; +atomic_t n_rcu_torture_error; + +/* + * Allocate an element from the rcu_tortures pool. + */ +struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock(&rcu_torture_lock); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else + call_rcu(p, rcu_torture_cb); +} + +struct rcu_random_state { + unsigned long rrs_state; + unsigned long rrs_count; +}; + +#define RCU_RANDOM_MULT 39916801 /* prime */ +#define RCU_RANDOM_ADD 479001701 /* prime */ +#define RCU_RANDOM_REFRESH 10000 + +#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static long +rcu_random(struct rcu_random_state *rrsp) +{ + long refresh; + + if (--rrsp->rrs_count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rrsp->rrs_state += refresh; + rrsp->rrs_count = RCU_RANDOM_REFRESH; + } + rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; + return swahw32(rrsp->rrs_state); +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + int i; + long oldbatch = rcu_batches_completed(); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + if (rcu_batches_completed() == oldbatch) + continue; + if ((rp = rcu_torture_alloc()) == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(rcu_random(&rand) & 0x3ff); + old_rp = rcu_torture_current; + rp->rtort_mbtest = 1; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); + if (old_rp != NULL) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); + } + rcu_torture_current_version++; + oldbatch = rcu_batches_completed(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + DEFINE_RCU_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + set_user_nice(current, 19); + + do { + rcu_read_lock(); + completed = rcu_batches_completed(); + p = rcu_dereference(rcu_torture_current); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + rcu_read_unlock(); + schedule_timeout_interruptible(HZ); + continue; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + udelay(rcu_random(&rand) & 0x7f); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_count)[pipe_count]; + completed = rcu_batches_completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_batch)[completed]; + preempt_enable(); + rcu_read_unlock(); + schedule(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer. + */ +static int +rcu_torture_printk(char *page) +{ + int cnt = 0; + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + if (pipesummary[i] != 0) + break; + } + cnt += sprintf(&page[cnt], "rcutorture: "); + cnt += sprintf(&page[cnt], + "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " + "rtmbe: %d", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free), + atomic_read(&n_rcu_torture_mberror)); + if (atomic_read(&n_rcu_torture_mberror) != 0) + cnt += sprintf(&page[cnt], " !!!"); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + if (i > 1) { + cnt += sprintf(&page[cnt], "!!! "); + atomic_inc(&n_rcu_torture_error); + } + cnt += sprintf(&page[cnt], "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + cnt += sprintf(&page[cnt], " %d", + atomic_read(&rcu_torture_wcount[i])); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + */ +static void +rcu_torture_stats_print(void) +{ + int cnt; + + cnt = rcu_torture_printk(printk_buf); + printk(KERN_ALERT "%s", printk_buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. + */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static void +rcu_torture_cleanup(void) +{ + int i; + + fullstop = 1; + if (writer_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks != NULL) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (stats_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + /* Wait for all RCU callbacks to fire. */ + + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + synchronize_rcu(); + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + printk(KERN_ALERT TORTURE_FLAG + "--- End of test: %s\n", + atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); +} + +static int +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + + /* Process args and tell the world that the torturer is on the job. */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + printk(KERN_ALERT TORTURE_FLAG + "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", + nrealreaders, stat_interval, verbose); + fullstop = 0; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + rcu_tortures[i].rtort_mbtest = 0; + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_error, 0); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. */ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_run(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + return 0; + +unwind: + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/kernel/sched.c b/kernel/sched.c index 1f31a528fdba..6f46c94cc29e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -206,6 +206,7 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP + unsigned long prio_bias; unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -659,13 +660,68 @@ static int effective_prio(task_t *p) return prio; } +#ifdef CONFIG_SMP +static inline void inc_prio_bias(runqueue_t *rq, int prio) +{ + rq->prio_bias += MAX_PRIO - prio; +} + +static inline void dec_prio_bias(runqueue_t *rq, int prio) +{ + rq->prio_bias -= MAX_PRIO - prio; +} + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; + if (rt_task(p)) { + if (p != rq->migration_thread) + /* + * The migration thread does the actual balancing. Do + * not bias by its priority as the ultra high priority + * will skew balancing adversely. + */ + inc_prio_bias(rq, p->prio); + } else + inc_prio_bias(rq, p->static_prio); +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; + if (rt_task(p)) { + if (p != rq->migration_thread) + dec_prio_bias(rq, p->prio); + } else + dec_prio_bias(rq, p->static_prio); +} +#else +static inline void inc_prio_bias(runqueue_t *rq, int prio) +{ +} + +static inline void dec_prio_bias(runqueue_t *rq, int prio) +{ +} + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; +} +#endif + /* * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - rq->nr_running++; + inc_nr_running(p, rq); } /* @@ -674,7 +730,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); - rq->nr_running++; + inc_nr_running(p, rq); } static int recalc_task_prio(task_t *p, unsigned long long now) @@ -759,7 +815,8 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) } #endif - p->prio = recalc_task_prio(p, now); + if (!rt_task(p)) + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -793,7 +850,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - rq->nr_running--; + dec_nr_running(p, rq); dequeue_task(p, p->array); p->array = NULL; } @@ -808,21 +865,28 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) #ifdef CONFIG_SMP static void resched_task(task_t *p) { - int need_resched, nrpolling; + int cpu; assert_spin_locked(&task_rq(p)->lock); - /* minimise the chance of sending an interrupt to poll_idle() */ - nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); - need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); - nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; - if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) - smp_send_reschedule(task_cpu(p)); + /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ + smp_mb(); + if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) + smp_send_reschedule(cpu); } #else static inline void resched_task(task_t *p) { + assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } #endif @@ -930,27 +994,61 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu, int type) +static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + unsigned long running = rq->nr_running; + unsigned long source_load, cpu_load = rq->cpu_load[type-1], + load_now = running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + source_load = load_now; + else + source_load = min(cpu_load, load_now); + + if (running > 1 || (idle == NOT_IDLE && running)) + /* + * If we are busy rebalancing the load is biased by + * priority to create 'nice' support across cpus. When + * idle rebalancing we should only bias the source_load if + * there is more than one task running on that queue to + * prevent idle rebalance from trying to pull tasks from a + * queue with only one running task. + */ + source_load = source_load * rq->prio_bias / running; + + return source_load; +} - return min(rq->cpu_load[type-1], load_now); +static inline unsigned long source_load(int cpu, int type) +{ + return __source_load(cpu, type, NOT_IDLE); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu, int type) +static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + unsigned long running = rq->nr_running; + unsigned long target_load, cpu_load = rq->cpu_load[type-1], + load_now = running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + target_load = load_now; + else + target_load = max(cpu_load, load_now); + + if (running > 1 || (idle == NOT_IDLE && running)) + target_load = target_load * rq->prio_bias / running; + + return target_load; +} - return max(rq->cpu_load[type-1], load_now); +static inline unsigned long target_load(int cpu, int type) +{ + return __target_load(cpu, type, NOT_IDLE); } /* @@ -1339,7 +1437,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ - p->thread_info->preempt_count = 1; + task_thread_info(p)->preempt_count = 1; #endif /* * Share the timeslice between parent and child, thus the @@ -1411,7 +1509,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; - rq->nr_running++; + inc_nr_running(p, rq); } set_need_resched(); } else @@ -1468,7 +1566,7 @@ void fastcall sched_exit(task_t *p) * the sleep_avg of the parent as well. */ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { + if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { p->parent->time_slice += p->time_slice; if (unlikely(p->parent->time_slice > task_timeslice(p))) p->parent->time_slice = task_timeslice(p); @@ -1756,9 +1854,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - src_rq->nr_running--; + dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); - this_rq->nr_running++; + inc_nr_running(p, this_rq); enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; @@ -1937,9 +2035,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i, load_idx); + load = __target_load(i, load_idx, idle); else - load = source_load(i, load_idx); + load = __source_load(i, load_idx, idle); avg_load += load; } @@ -2044,14 +2142,15 @@ out_balanced: /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static runqueue_t *find_busiest_queue(struct sched_group *group) +static runqueue_t *find_busiest_queue(struct sched_group *group, + enum idle_type idle) { unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i, 0); + load = __source_load(i, 0, idle); if (load > max_load) { max_load = load; @@ -2095,7 +2194,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - busiest = find_busiest_queue(group); + busiest = find_busiest_queue(group, idle); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2218,7 +2317,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - busiest = find_busiest_queue(group); + busiest = find_busiest_queue(group, NEWLY_IDLE); if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; @@ -2511,8 +2610,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->idle = cputime64_add(cpustat->idle, tmp); /* Account for system time used */ acct_update_integrals(p); - /* Update rss highwater mark */ - update_mem_hiwater(p); } /* @@ -3453,8 +3550,10 @@ void set_user_nice(task_t *p, long nice) goto out_unlock; } array = p->array; - if (array) + if (array) { dequeue_task(p, array); + dec_prio_bias(rq, p->static_prio); + } old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); @@ -3464,6 +3563,7 @@ void set_user_nice(task_t *p, long nice) if (array) { enqueue_task(p, array); + inc_prio_bias(rq, p->static_prio); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3565,8 +3665,6 @@ int idle_cpu(int cpu) return cpu_curr(cpu) == cpu_rq(cpu)->idle; } -EXPORT_SYMBOL_GPL(idle_cpu); - /** * idle_task - return the idle task for a given cpu. * @cpu: the processor in question. @@ -4229,10 +4327,10 @@ static void show_task(task_t *p) #endif #ifdef CONFIG_DEBUG_STACK_USAGE { - unsigned long *n = (unsigned long *) (p->thread_info+1); + unsigned long *n = end_of_stack(p); while (!*n) n++; - free = (unsigned long) n - (unsigned long)(p->thread_info+1); + free = (unsigned long)n - (unsigned long)end_of_stack(p); } #endif printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); @@ -4312,9 +4410,9 @@ void __devinit init_idle(task_t *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! */ #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) - idle->thread_info->preempt_count = (idle->lock_depth >= 0); + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); #else - idle->thread_info->preempt_count = 0; + task_thread_info(idle)->preempt_count = 0; #endif } @@ -4682,7 +4780,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: /* Unbind it from offline cpu so it can run. Fall thru. */ - kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); + kthread_bind(cpu_rq(cpu)->migration_thread, + any_online_cpu(cpu_online_map)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; diff --git a/kernel/signal.c b/kernel/signal.c index b92c3c9f8b9a..d7611f189ef7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask) return sig; } -static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, +static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; @@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __n } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->lock = NULL; q->user = get_uid(t->user); } return(q); @@ -397,20 +396,8 @@ void __exit_signal(struct task_struct *tsk) flush_sigqueue(&tsk->pending); if (sig) { /* - * We are cleaning up the signal_struct here. We delayed - * calling exit_itimers until after flush_sigqueue, just in - * case our thread-local pending queue contained a queued - * timer signal that would have been cleared in - * exit_itimers. When that called sigqueue_free, it would - * attempt to re-take the tasklist_lock and deadlock. This - * can never happen if we ensure that all queues the - * timer's signal might be queued on have been flushed - * first. The shared_pending queue, and our own pending - * queue are the only queues the timer could be on, since - * there are no other threads left in the group and timer - * signals are constrained to threads inside the group. + * We are cleaning up the signal_struct here. */ - exit_itimers(sig); exit_thread_group_keys(sig); kmem_cache_free(signal_cachep, sig); } @@ -418,6 +405,8 @@ void __exit_signal(struct task_struct *tsk) void exit_signal(struct task_struct *tsk) { + atomic_dec(&tsk->signal->live); + write_lock_irq(&tasklist_lock); __exit_signal(tsk); write_unlock_irq(&tasklist_lock); @@ -524,16 +513,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = 0; - /* SIGKILL must have priority, otherwise it is quite easy - * to create an unkillable process, sending sig < SIGKILL - * to self */ - if (unlikely(sigismember(&pending->signal, SIGKILL))) { - if (!sigismember(mask, SIGKILL)) - sig = SIGKILL; - } - - if (likely(!sig)) - sig = next_signal(pending, mask); + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { @@ -578,7 +558,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } if ( signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && @@ -661,8 +642,7 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; error = -EPERM; - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) @@ -799,7 +779,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. */ - if ((unsigned long)info == 2) + if (info == SEND_SIG_FORCED) goto out_set; /* Real-time signals must be queued if sent by sigqueue, or @@ -811,19 +791,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - ((unsigned long) info < 2 || + (is_si_special(info) || info->si_code >= 0))); if (q) { list_add_tail(&q->list, &signals->list); switch ((unsigned long) info) { - case 0: + case (unsigned long) SEND_SIG_NOINFO: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = current->pid; q->info.si_uid = current->uid; break; - case 1: + case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -834,20 +814,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, copy_siginfo(&q->info, info); break; } - } else { - if (sig >= SIGRTMIN && info && (unsigned long)info != 1 - && info->si_code != SI_USER) + } else if (!is_si_special(info)) { + if (sig >= SIGRTMIN && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). */ return -EAGAIN; - if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped - * the signal. - */ - ret = info->si_sys_private; } out_set: @@ -868,12 +841,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. */ if (sig_ignored(t, sig)) goto out; @@ -903,11 +870,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) int ret; spin_lock_irqsave(&t->sighand->siglock, flags); - if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { + if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + } + if (sigismember(&t->blocked, sig)) { sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); } + recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -917,15 +886,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) void force_sig_specific(int sig, struct task_struct *t) { - unsigned long int flags; - - spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); - specific_send_sig_info(sig, (void *)2, t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); + force_sig_info(sig, SEND_SIG_FORCED, t); } /* @@ -936,34 +897,31 @@ force_sig_specific(int sig, struct task_struct *t) * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ -#define wants_signal(sig, p, mask) \ - (!sigismember(&(p)->blocked, sig) \ - && !((p)->state & mask) \ - && !((p)->flags & PF_EXITING) \ - && (task_curr(p) || !signal_pending(p))) - +static inline int wants_signal(int sig, struct task_struct *p) +{ + if (sigismember(&p->blocked, sig)) + return 0; + if (p->flags & PF_EXITING) + return 0; + if (sig == SIGKILL) + return 1; + if (p->state & (TASK_STOPPED | TASK_TRACED)) + return 0; + return task_curr(p) || !signal_pending(p); +} static void __group_complete_signal(int sig, struct task_struct *p) { - unsigned int mask; struct task_struct *t; /* - * Don't bother traced and stopped tasks (but - * SIGKILL will punch through that). - */ - mask = TASK_STOPPED | TASK_TRACED; - if (sig == SIGKILL) - mask = 0; - - /* * Now find a thread we can wake up to take the signal off the queue. * * If the main thread wants the signal, it gets first crack. * Probably the least surprising to the average bear. */ - if (wants_signal(sig, p, mask)) + if (wants_signal(sig, p)) t = p; else if (thread_group_empty(p)) /* @@ -981,7 +939,7 @@ __group_complete_signal(int sig, struct task_struct *p) t = p->signal->curr_target = p; BUG_ON(t->tgid != p->tgid); - while (!wants_signal(sig, t, mask)) { + while (!wants_signal(sig, t)) { t = next_thread(t); if (t == p->signal->curr_target) /* @@ -1063,12 +1021,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. */ if (sig_ignored(p, sig)) return ret; @@ -1121,8 +1073,8 @@ void zap_other_threads(struct task_struct *p) if (t != p->group_leader) t->exit_signal = -1; + /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); signal_wake_up(t, 1); } } @@ -1195,6 +1147,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } +/* like kill_proc_info(), but doesn't use uid/euid of "current" */ +int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, + uid_t uid, uid_t euid) +{ + int ret = -EINVAL; + struct task_struct *p; + + if (!valid_signal(sig)) + return ret; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) { + ret = -ESRCH; + goto out_unlock; + } + if ((!info || ((unsigned long)info != 1 && + (unsigned long)info != 2 && SI_FROMUSER(info))) + && (euid != p->suid) && (euid != p->uid) + && (uid != p->suid) && (uid != p->uid)) { + ret = -EPERM; + goto out_unlock; + } + if (sig && p->sighand) { + unsigned long flags; + spin_lock_irqsave(&p->sighand->siglock, flags); + ret = __group_send_sig_info(sig, info, p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +out_unlock: + read_unlock(&tasklist_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -1264,10 +1250,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) return ret; } +#define __si_special(priv) \ + ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) + int send_sig(int sig, struct task_struct *p, int priv) { - return send_sig_info(sig, (void*)(long)(priv != 0), p); + return send_sig_info(sig, __si_special(priv), p); } /* @@ -1287,7 +1276,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) void force_sig(int sig, struct task_struct *p) { - force_sig_info(sig, (void*)1L, p); + force_sig_info(sig, SEND_SIG_PRIV, p); } /* @@ -1312,13 +1301,13 @@ force_sigsegv(int sig, struct task_struct *p) int kill_pg(pid_t pgrp, int sig, int priv) { - return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); + return kill_pg_info(sig, __si_special(priv), pgrp); } int kill_proc(pid_t pid, int sig, int priv) { - return kill_proc_info(sig, (void *)(long)(priv != 0), pid); + return kill_proc_info(sig, __si_special(priv), pid); } /* @@ -1349,11 +1338,12 @@ void sigqueue_free(struct sigqueue *q) * pending queue. */ if (unlikely(!list_empty(&q->list))) { - read_lock(&tasklist_lock); - spin_lock_irqsave(q->lock, flags); + spinlock_t *lock = ¤t->sighand->siglock; + read_lock(&tasklist_lock); + spin_lock_irqsave(lock, flags); if (!list_empty(&q->list)) list_del_init(&q->list); - spin_unlock_irqrestore(q->lock, flags); + spin_unlock_irqrestore(lock, flags); read_unlock(&tasklist_lock); } q->flags &= ~SIGQUEUE_PREALLOC; @@ -1392,7 +1382,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) goto out; } - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->pending.list); sigaddset(&p->pending.signal, sig); if (!sigismember(&p->blocked, sig)) @@ -1440,7 +1429,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) * We always use the shared queue for process-wide signals, * to avoid several races. */ - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->signal->shared_pending.list); sigaddset(&p->signal->shared_pending.signal, sig); @@ -1502,7 +1490,7 @@ void do_notify_parent(struct task_struct *tsk, int sig) psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); - if (sig == SIGCHLD && + if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* @@ -1766,7 +1754,8 @@ do_signal_stop(int signr) * stop is always done with the siglock held, * so this check has no races. */ - if (t->state < TASK_STOPPED) { + if (!t->exit_state && + !(t->state & (TASK_STOPPED|TASK_TRACED))) { stop_count++; signal_wake_up(t, 0); } @@ -1858,9 +1847,9 @@ relock: /* Let the debugger run. */ ptrace_stop(signr, signr, info); - /* We're back. Did the debugger cancel the sig? */ + /* We're back. Did the debugger cancel the sig or group_exit? */ signr = current->exit_code; - if (signr == 0) + if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) continue; current->exit_code = 0; @@ -2262,26 +2251,13 @@ sys_kill(int pid, int sig) return kill_something_info(sig, &info, pid); } -/** - * sys_tgkill - send signal to one specific thread - * @tgid: the thread group ID of the thread - * @pid: the PID of the thread - * @sig: signal to be sent - * - * This syscall also checks the tgid and returns -ESRCH even if the PID - * exists but it's not belonging to the target process anymore. This - * method solves the problem of threads exiting and PIDs getting reused. - */ -asmlinkage long sys_tgkill(int tgid, int pid, int sig) +static int do_tkill(int tgid, int pid, int sig) { - struct siginfo info; int error; + struct siginfo info; struct task_struct *p; - /* This is only valid for single tasks */ - if (pid <= 0 || tgid <= 0) - return -EINVAL; - + error = -ESRCH; info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; @@ -2290,8 +2266,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) read_lock(&tasklist_lock); p = find_task_by_pid(pid); - error = -ESRCH; - if (p && (p->tgid == tgid)) { + if (p && (tgid <= 0 || p->tgid == tgid)) { error = check_kill_permission(sig, &info, p); /* * The null signal is a permissions and process existence @@ -2305,47 +2280,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) } } read_unlock(&tasklist_lock); + return error; } +/** + * sys_tgkill - send signal to one specific thread + * @tgid: the thread group ID of the thread + * @pid: the PID of the thread + * @sig: signal to be sent + * + * This syscall also checks the tgid and returns -ESRCH even if the PID + * exists but it's not belonging to the target process anymore. This + * method solves the problem of threads exiting and PIDs getting reused. + */ +asmlinkage long sys_tgkill(int tgid, int pid, int sig) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + return do_tkill(tgid, pid, sig); +} + /* * Send a signal to only one task, even if it's a CLONE_THREAD task. */ asmlinkage long sys_tkill(int pid, int sig) { - struct siginfo info; - int error; - struct task_struct *p; - /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = current->tgid; - info.si_uid = current->uid; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - error = -ESRCH; - if (p) { - error = check_kill_permission(sig, &info, p); - /* - * The null signal is a permissions and process existence - * probe. No signal is actually delivered. - */ - if (!error && sig && p->sighand) { - spin_lock_irq(&p->sighand->siglock); - handle_stop_signal(sig, p); - error = specific_send_sig_info(sig, &info, p); - spin_unlock_irq(&p->sighand->siglock); - } - } - read_unlock(&tasklist_lock); - return error; + return do_tkill(0, pid, sig); } asmlinkage long diff --git a/kernel/softirq.c b/kernel/softirq.c index f766b2fc48be..ad3295cdded5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -470,7 +470,8 @@ static int __devinit cpu_callback(struct notifier_block *nfb, #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id()); + kthread_bind(per_cpu(ksoftirqd, hotcpu), + any_online_cpu(cpu_online_map)); case CPU_DEAD: p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 75976209cea7..c67189a25d52 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -73,9 +73,6 @@ void softlockup_tick(struct pt_regs *regs) static int watchdog(void * __bind_cpu) { struct sched_param param = { .sched_priority = 99 }; - int this_cpu = (long) __bind_cpu; - - printk("softlockup thread %d started up.\n", this_cpu); sched_setscheduler(current, SCHED_FIFO, ¶m); current->flags |= PF_NOFREEZE; @@ -123,7 +120,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); + kthread_bind(per_cpu(watchdog_task, hotcpu), + any_online_cpu(cpu_online_map)); case CPU_DEAD: p = per_cpu(watchdog_task, hotcpu); per_cpu(watchdog_task, hotcpu) = NULL; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84a9d18aa8da..b3d4dc858e35 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -119,13 +119,12 @@ static int stop_machine(void) return ret; } - /* Don't schedule us away at this point, please. */ - local_irq_disable(); - /* Now they are all started, make them hold the CPUs, ready. */ + preempt_disable(); stopmachine_set_state(STOPMACHINE_PREPARE); /* Make them disable irqs. */ + local_irq_disable(); stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); return 0; @@ -135,6 +134,7 @@ static void restart_machine(void) { stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); + preempt_enable_no_resched(); } struct stop_machine_data diff --git a/kernel/sys.c b/kernel/sys.c index f723522e6986..bce933ebb29f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -28,6 +28,7 @@ #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> +#include <linux/cn_proc.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -361,17 +362,38 @@ out_unlock: return retval; } +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. + */ void emergency_restart(void) { machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); -void kernel_restart(char *cmd) +void kernel_restart_prepare(char *cmd) { notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; device_shutdown(); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); if (!cmd) { printk(KERN_EMERG "Restarting system.\n"); } else { @@ -382,6 +404,12 @@ void kernel_restart(char *cmd) } EXPORT_SYMBOL_GPL(kernel_restart); +/** + * kernel_kexec - reboot the system + * + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ void kernel_kexec(void) { #ifdef CONFIG_KEXEC @@ -390,9 +418,7 @@ void kernel_kexec(void) if (!image) { return; } - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_shutdown(); + kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); machine_kexec(image); @@ -400,21 +426,39 @@ void kernel_kexec(void) } EXPORT_SYMBOL_GPL(kernel_kexec); -void kernel_halt(void) +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt_prepare(void) { notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); system_state = SYSTEM_HALT; device_shutdown(); +} +void kernel_halt(void) +{ + kernel_halt_prepare(); printk(KERN_EMERG "System halted.\n"); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); -void kernel_power_off(void) +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off_prepare(void) { notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); system_state = SYSTEM_POWER_OFF; device_shutdown(); +} +void kernel_power_off(void) +{ + kernel_power_off_prepare(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); } @@ -583,6 +627,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) current->egid = new_egid; current->gid = new_rgid; key_fsgid_changed(current); + proc_id_connector(current, PROC_EVENT_GID); return 0; } @@ -622,6 +667,7 @@ asmlinkage long sys_setgid(gid_t gid) return -EPERM; key_fsgid_changed(current); + proc_id_connector(current, PROC_EVENT_GID); return 0; } @@ -711,6 +757,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) current->fsuid = current->euid; key_fsuid_changed(current); + proc_id_connector(current, PROC_EVENT_UID); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); } @@ -758,6 +805,7 @@ asmlinkage long sys_setuid(uid_t uid) current->suid = new_suid; key_fsuid_changed(current); + proc_id_connector(current, PROC_EVENT_UID); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); } @@ -806,6 +854,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) current->suid = suid; key_fsuid_changed(current); + proc_id_connector(current, PROC_EVENT_UID); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); } @@ -858,6 +907,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) current->sgid = sgid; key_fsgid_changed(current); + proc_id_connector(current, PROC_EVENT_GID); return 0; } @@ -900,6 +950,7 @@ asmlinkage long sys_setfsuid(uid_t uid) } key_fsuid_changed(current); + proc_id_connector(current, PROC_EVENT_UID); security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); @@ -928,6 +979,7 @@ asmlinkage long sys_setfsgid(gid_t gid) } current->fsgid = gid; key_fsgid_changed(current); + proc_id_connector(current, PROC_EVENT_GID); } return old_fsgid; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e56e2495542..9990e10192e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -169,7 +169,7 @@ struct file_operations proc_sys_file_operations = { extern struct proc_dir_entry *proc_sys_root; -static void register_proc_table(ctl_table *, struct proc_dir_entry *); +static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *); static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); #endif @@ -952,7 +952,7 @@ static ctl_table fs_table[] = { .data = &aio_nr, .maxlen = sizeof(aio_nr), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_doulongvec_minmax, }, { .ctl_name = FS_AIO_MAX_NR, @@ -960,7 +960,7 @@ static ctl_table fs_table[] = { .data = &aio_max_nr, .maxlen = sizeof(aio_max_nr), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_doulongvec_minmax, }, #ifdef CONFIG_INOTIFY { @@ -992,10 +992,51 @@ static ctl_table dev_table[] = { extern void init_irq_proc (void); +static DEFINE_SPINLOCK(sysctl_lock); + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + list_del_init(&p->ctl_entry); +} + void __init sysctl_init(void) { #ifdef CONFIG_PROC_FS - register_proc_table(root_table, proc_sys_root); + register_proc_table(root_table, proc_sys_root, &root_table_header); init_irq_proc(); #endif } @@ -1004,6 +1045,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol void __user *newval, size_t newlen) { struct list_head *tmp; + int error = -ENOTDIR; if (nlen <= 0 || nlen >= CTL_MAXNAME) return -ENOTDIR; @@ -1012,20 +1054,30 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol if (!oldlenp || get_user(old_len, oldlenp)) return -EFAULT; } + spin_lock(&sysctl_lock); tmp = &root_table_header.ctl_entry; do { struct ctl_table_header *head = list_entry(tmp, struct ctl_table_header, ctl_entry); void *context = NULL; - int error = parse_table(name, nlen, oldval, oldlenp, + + if (!use_table(head)) + continue; + + spin_unlock(&sysctl_lock); + + error = parse_table(name, nlen, oldval, oldlenp, newval, newlen, head->ctl_table, &context); kfree(context); + + spin_lock(&sysctl_lock); + unuse_table(head); if (error != -ENOTDIR) - return error; - tmp = tmp->next; - } while (tmp != &root_table_header.ctl_entry); - return -ENOTDIR; + break; + } while ((tmp = tmp->next) != &root_table_header.ctl_entry); + spin_unlock(&sysctl_lock); + return error; } asmlinkage long sys_sysctl(struct __sysctl_args __user *args) @@ -1236,12 +1288,16 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, return NULL; tmp->ctl_table = table; INIT_LIST_HEAD(&tmp->ctl_entry); + tmp->used = 0; + tmp->unregistering = NULL; + spin_lock(&sysctl_lock); if (insert_at_head) list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); else list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); + spin_unlock(&sysctl_lock); #ifdef CONFIG_PROC_FS - register_proc_table(table, proc_sys_root); + register_proc_table(table, proc_sys_root, tmp); #endif return tmp; } @@ -1255,10 +1311,13 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, */ void unregister_sysctl_table(struct ctl_table_header * header) { - list_del(&header->ctl_entry); + might_sleep(); + spin_lock(&sysctl_lock); + start_unregistering(header); #ifdef CONFIG_PROC_FS unregister_proc_table(header->ctl_table, proc_sys_root); #endif + spin_unlock(&sysctl_lock); kfree(header); } @@ -1269,7 +1328,7 @@ void unregister_sysctl_table(struct ctl_table_header * header) #ifdef CONFIG_PROC_FS /* Scan the sysctl entries in table and add them all into /proc */ -static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) +static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) { struct proc_dir_entry *de; int len; @@ -1305,13 +1364,14 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) de = create_proc_entry(table->procname, mode, root); if (!de) continue; + de->set = set; de->data = (void *) table; if (table->proc_handler) de->proc_fops = &proc_sys_file_operations; } table->de = de; if (de->mode & S_IFDIR) - register_proc_table(table->child, de); + register_proc_table(table->child, de, set); } } @@ -1336,6 +1396,13 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root continue; } + /* + * In any case, mark the entry as goner; we'll keep it + * around if it's busy, but we'll know to do nothing with + * its fields. We are under sysctl_lock here. + */ + de->data = NULL; + /* Don't unregister proc entries that are still being used.. */ if (atomic_read(&de->count)) continue; @@ -1349,27 +1416,38 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, size_t count, loff_t *ppos) { int op; - struct proc_dir_entry *de; + struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); struct ctl_table *table; size_t res; - ssize_t error; - - de = PDE(file->f_dentry->d_inode); - if (!de || !de->data) - return -ENOTDIR; - table = (struct ctl_table *) de->data; - if (!table || !table->proc_handler) - return -ENOTDIR; - op = (write ? 002 : 004); - if (ctl_perm(table, op)) - return -EPERM; + ssize_t error = -ENOTDIR; - res = count; - - error = (*table->proc_handler) (table, write, file, buf, &res, ppos); - if (error) - return error; - return res; + spin_lock(&sysctl_lock); + if (de && de->data && use_table(de->set)) { + /* + * at that point we know that sysctl was not unregistered + * and won't be until we finish + */ + spin_unlock(&sysctl_lock); + table = (struct ctl_table *) de->data; + if (!table || !table->proc_handler) + goto out; + error = -EPERM; + op = (write ? 002 : 004); + if (ctl_perm(table, op)) + goto out; + + /* careful: calling conventions are nasty here */ + res = count; + error = (*table->proc_handler)(table, write, file, + buf, &res, ppos); + if (!error) + error = res; + out: + spin_lock(&sysctl_lock); + unuse_table(de->set); + } + spin_unlock(&sysctl_lock); + return error; } static int proc_opensys(struct inode *inode, struct file *file) @@ -1997,6 +2075,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer + * @ppos: pointer to the file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. diff --git a/kernel/time.c b/kernel/time.c index dd5ae1162a8f..245d595a13cb 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) if (mtemp >= MINSEC) { ltemp = (time_offset / mtemp) << (SHIFT_USEC - SHIFT_UPDATE); - if (ltemp < 0) - time_freq -= -ltemp >> SHIFT_KH; - else - time_freq += ltemp >> SHIFT_KH; + time_freq += shift_right(ltemp, SHIFT_KH); } else /* calibration interval too short (p. 12) */ result = TIME_ERROR; } else { /* PLL mode */ if (mtemp < MAXSEC) { ltemp *= mtemp; - if (ltemp < 0) - time_freq -= -ltemp >> (time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC); - else - time_freq += ltemp >> (time_constant + + time_freq += shift_right(ltemp,(time_constant + time_constant + - SHIFT_KF - SHIFT_USEC); + SHIFT_KF - SHIFT_USEC)); } else /* calibration interval too long (p. 12) */ result = TIME_ERROR; } - if (time_freq > time_tolerance) - time_freq = time_tolerance; - else if (time_freq < -time_tolerance) - time_freq = -time_tolerance; + time_freq = min(time_freq, time_tolerance); + time_freq = max(time_freq, -time_tolerance); } /* STA_PLL || STA_PPSTIME */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) { @@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else { - if (time_offset < 0) - txc->offset = -(-time_offset >> SHIFT_UPDATE); - else - txc->offset = time_offset >> SHIFT_UPDATE; + txc->offset = shift_right(time_offset, SHIFT_UPDATE); } txc->freq = time_freq + pps_freq; txc->maxerror = time_maxerror; @@ -532,6 +519,7 @@ int do_settimeofday (struct timespec *tv) clock_was_set(); return 0; } +EXPORT_SYMBOL(do_settimeofday); void do_gettimeofday (struct timeval *tv) { @@ -570,6 +558,7 @@ void getnstimeofday(struct timespec *tv) tv->tv_sec = x.tv_sec; tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; } +EXPORT_SYMBOL_GPL(getnstimeofday); #endif #if (BITS_PER_LONG < 64) diff --git a/kernel/timer.c b/kernel/timer.c index 3ba10fa35b60..fd74268d8663 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); #define time_interpolator_update(x) #endif +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + /* * per-CPU timer vector definitions: */ @@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base, #endif } -static void check_timer_failed(struct timer_list *timer) -{ - static int whine_count; - if (whine_count < 16) { - whine_count++; - printk("Uninitialised timer!\n"); - printk("This is just a warning. Your computer is OK\n"); - printk("function=0x%p, data=0x%lx\n", - timer->function, timer->data); - dump_stack(); - } - /* - * Now fix it up - */ - timer->magic = TIMER_MAGIC; -} - -static inline void check_timer(struct timer_list *timer) -{ - if (timer->magic != TIMER_MAGIC) - check_timer_failed(timer); -} - - static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; - timer->magic = TIMER_MAGIC; } EXPORT_SYMBOL(init_timer); @@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) int ret = 0; BUG_ON(!timer->function); - check_timer(timer); base = lock_timer_base(timer, &flags); @@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu) unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); - - check_timer(timer); - spin_lock_irqsave(&base->t_base.lock, flags); timer->base = &base->t_base; internal_add_timer(base, timer); @@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); - check_timer(timer); - /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; - check_timer(timer); - if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -412,8 +383,6 @@ out: */ int del_timer_sync(struct timer_list *timer) { - check_timer(timer); - for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -632,134 +601,118 @@ long time_next_adjust; */ static void second_overflow(void) { - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if ( time_maxerror > NTP_PHASE_LIMIT ) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at - * the end of the day, the system clock is set back one - * second; if in leap-delete state, the system clock is - * set ahead one second. The microtime() routine or - * external clock driver will insure that reported time - * is always monotonic. The ugly divides should be - * replaced. - */ - switch (time_state) { - - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* The timer interpolator will make time change gradually instead - * of an immediate jump by one second. - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; } - break; - - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* Use of time interpolator for a gradual change of time */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; } - break; - - case TIME_OOP: - time_state = TIME_WAIT; - break; - - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In - * PLL mode, the offset is reduced by a fixed factor - * times the time constant. In FLL mode the offset is - * used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread - * the adjustment over not more than the number of - * seconds between updates. - */ - if (time_offset < 0) { - ltemp = -time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset += ltemp; - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } else { + + /* + * Compute the phase adjustment for the next second. In PLL mode, the + * offset is reduced by a fixed factor times the time constant. In FLL + * mode the offset is used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread the adjustment + * over not more than the number of seconds between updates. + */ ltemp = time_offset; if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + ltemp = shift_right(ltemp, SHIFT_KG + time_constant); + ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); + ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } - - /* - * Compute the frequency estimate and additional phase - * adjustment due to frequency error for the next - * second. When the PPS signal is engaged, gnaw on the - * watchdog counter and update the frequency computed by - * the pll and the PPS signal. - */ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; - if (ltemp < 0) - time_adj -= -ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - else - time_adj += ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + + /* + * Compute the frequency estimate and additional phase adjustment due + * to frequency error for the next second. When the PPS signal is + * engaged, gnaw on the watchdog counter and update the frequency + * computed by the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 - /* Compensate for (HZ==100) != (1 << SHIFT_HZ). - * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 2) + (-time_adj >> 5); - else - time_adj += (time_adj >> 2) + (time_adj >> 5); + /* + * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to + * get 128.125; => only 0.125% error (p. 14) + */ + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); +#endif +#if HZ == 250 + /* + * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif #if HZ == 1000 - /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + /* + * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif } @@ -768,23 +721,20 @@ static void update_wall_time_one_tick(void) { long time_adjust_step, delta_nsec; - if ( (time_adjust_step = time_adjust) != 0 ) { - /* We are doing an adjtime thing. - * - * Prepare time_adjust_step to be within bounds. - * Note that a positive time_adjust means we want the clock - * to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - if (time_adjust > tickadj) - time_adjust_step = tickadj; - else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; - - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; + if ((time_adjust_step = time_adjust) != 0 ) { + /* + * We are doing an adjtime thing. Prepare time_adjust_step to + * be within bounds. Note that a positive time_adjust means we + * want the clock to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; } delta_nsec = tick_nsec + time_adjust_step * 1000; /* @@ -792,13 +742,8 @@ static void update_wall_time_one_tick(void) * advance the tick more. */ time_phase += time_adj; - if (time_phase <= -FINENSEC) { - long ltemp = -time_phase >> (SHIFT_SCALE - 10); - time_phase += ltemp << (SHIFT_SCALE - 10); - delta_nsec -= ltemp; - } - else if (time_phase >= FINENSEC) { - long ltemp = time_phase >> (SHIFT_SCALE - 10); + if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { + long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); time_phase -= ltemp << (SHIFT_SCALE - 10); delta_nsec += ltemp; } @@ -1128,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); + "value %lx from %p\n", timeout, + __builtin_return_address(0)); current->state = TASK_RUNNING; goto out; } @@ -1137,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; - - add_timer(&timer); + setup_timer(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire); schedule(); del_singleshot_timer_sync(&timer); @@ -1159,15 +1100,15 @@ EXPORT_SYMBOL(schedule_timeout); */ signed long __sched schedule_timeout_interruptible(signed long timeout) { - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); signed long __sched schedule_timeout_uninterruptible(signed long timeout) { - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); @@ -1507,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec) if (!time_interpolator) return; - /* The interpolator compensates for late ticks by accumulating - * the late time in time_interpolator->offset. A tick earlier than - * expected will lead to a reset of the offset and a corresponding - * jump of the clock forward. Again this only works if the - * interpolator clock is running slightly slower than the regular clock - * and the tuning logic insures that. - */ + /* + * The interpolator compensates for late ticks by accumulating the late + * time in time_interpolator->offset. A tick earlier than expected will + * lead to a reset of the offset and a corresponding jump of the clock + * forward. Again this only works if the interpolator clock is running + * slightly slower than the regular clock and the tuning logic insures + * that. + */ counter = time_interpolator_get_counter(1); - offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); + offset = time_interpolator->offset + + GET_TI_NSECS(counter, time_interpolator); if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) time_interpolator->offset = offset - delta_nsec; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91bacb13a7e2..2bd5aee1c736 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -12,6 +12,8 @@ * Andrew Morton <andrewm@uow.edu.au> * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> + * + * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>. */ #include <linux/module.h> @@ -57,7 +59,7 @@ struct cpu_workqueue_struct { * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct cpu_wq[NR_CPUS]; + struct cpu_workqueue_struct *cpu_wq; const char *name; struct list_head list; /* Empty if single thread */ }; @@ -100,9 +102,9 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) if (!test_and_set_bit(0, &work->pending)) { if (unlikely(is_single_threaded(wq))) - cpu = 0; + cpu = any_online_cpu(cpu_online_map); BUG_ON(!list_empty(&work->entry)); - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); ret = 1; } put_cpu(); @@ -116,9 +118,9 @@ static void delayed_work_timer_fn(unsigned long __data) int cpu = smp_processor_id(); if (unlikely(is_single_threaded(wq))) - cpu = 0; + cpu = any_online_cpu(cpu_online_map); - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } int fastcall queue_delayed_work(struct workqueue_struct *wq, @@ -264,14 +266,14 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) might_sleep(); if (is_single_threaded(wq)) { - /* Always use cpu 0's area. */ - flush_cpu_workqueue(wq->cpu_wq + 0); + /* Always use first cpu's area. */ + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); } else { int cpu; lock_cpu_hotplug(); for_each_online_cpu(cpu) - flush_cpu_workqueue(wq->cpu_wq + cpu); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); unlock_cpu_hotplug(); } } @@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; spin_lock_init(&cwq->lock); @@ -312,12 +314,13 @@ struct workqueue_struct *__create_workqueue(const char *name, if (!wq) return NULL; + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); if (singlethread) { INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, 0); + p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); if (!p) destroy = 1; else @@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) unsigned long flags; struct task_struct *p; - cwq = wq->cpu_wq + cpu; + cwq = per_cpu_ptr(wq->cpu_wq, cpu); spin_lock_irqsave(&cwq->lock, flags); p = cwq->thread; cwq->thread = NULL; @@ -371,7 +374,7 @@ void destroy_workqueue(struct workqueue_struct *wq) /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); if (is_single_threaded(wq)) - cleanup_workqueue_thread(wq, 0); + cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); else { for_each_online_cpu(cpu) cleanup_workqueue_thread(wq, cpu); @@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); } unlock_cpu_hotplug(); + free_percpu(wq->cpu_wq); kfree(wq); } @@ -458,7 +462,7 @@ int current_is_keventd(void) BUG_ON(!keventd_wq); - cwq = keventd_wq->cpu_wq + cpu; + cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); if (current == cwq->thread) ret = 1; @@ -470,7 +474,7 @@ int current_is_keventd(void) /* Take the work from this (downed) CPU. */ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); LIST_HEAD(list); struct work_struct *work; @@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) printk("Taking work for %s\n", wq->name); work = list_entry(list.next,struct work_struct,entry); list_del(&work->entry); - __queue_work(wq->cpu_wq + smp_processor_id(), work); + __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); } spin_unlock_irq(&cwq->lock); } @@ -508,16 +512,19 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: /* Kick off worker threads. */ list_for_each_entry(wq, &workqueues, list) { - kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); - wake_up_process(wq->cpu_wq[hotcpu].thread); + struct cpu_workqueue_struct *cwq; + + cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); + kthread_bind(cwq->thread, hotcpu); + wake_up_process(cwq->thread); } break; case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { /* Unbind so it can run. */ - kthread_bind(wq->cpu_wq[hotcpu].thread, - smp_processor_id()); + kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, + any_online_cpu(cpu_online_map)); cleanup_workqueue_thread(wq, hotcpu); } break; |