Diffstat (limited to 'kernel')
60 files changed, 3526 insertions, 1313 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index d62ec66c1af2..5e3f3b75563a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ @@ -48,8 +48,9 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o -obj-$(CONFIG_TASKSTATS) += taskstats.o +obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index 2a7c933651c7..0aad5ca36a81 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -483,10 +483,14 @@ static void do_acct_process(struct file *file) ac.ac_ppid = current->parent->tgid; #endif - read_lock(&tasklist_lock); /* pin current->signal */ + mutex_lock(&tty_mutex); + /* FIXME: Whoever is responsible for current->signal locking needs + to use the same locking all over the kernel and document it */ + read_lock(&tasklist_lock); ac.ac_tty = current->signal->tty ? old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); + mutex_unlock(&tty_mutex); spin_lock_irq(&current->sighand->siglock); ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); @@ -598,33 +602,3 @@ void acct_process(void) do_acct_process(file); fput(file); } - - -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) -{ - if (likely(tsk->mm)) { - long delta = - cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; - - if (delta == 0) - return; - tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - } -} - -/** - * acct_clear_integrals - clear the mm integral fields in task_struct - * @tsk: task_struct whose accounting fields are cleared - */ -void acct_clear_integrals(struct task_struct *tsk) -{ - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; -} diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 1a58a81fb09d..4f40d923af8e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -411,7 +411,6 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_FSGID: case AUDIT_LOGINUID: case AUDIT_PERS: - case AUDIT_ARCH: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: @@ -423,6 +422,14 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_ARG2: case AUDIT_ARG3: break; + /* arch is only allowed to be = or != */ + case AUDIT_ARCH: + if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) + && (f->op != AUDIT_NEGATE) && (f->op)) { + err = -EINVAL; + goto exit_free; + } + break; case AUDIT_PERM: if (f->val & ~15) goto exit_free; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fb83c5cb8c32..42f2f1179711 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -278,8 +278,11 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(tsk->pid, f->op, f->val); break;
case AUDIT_PPID: - if (ctx) + if (ctx) { + if (!ctx->ppid) + ctx->ppid = sys_getppid(); result = audit_comparator(ctx->ppid, f->op, f->val); + } break; case AUDIT_UID: result = audit_comparator(tsk->uid, f->op, f->val); @@ -795,7 +798,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts /* tsk == current */ context->pid = tsk->pid; - context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ + if (!context->ppid) + context->ppid = sys_getppid(); context->uid = tsk->uid; context->gid = tsk->gid; context->euid = tsk->euid; @@ -817,6 +821,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); + + mutex_lock(&tty_mutex); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else @@ -838,6 +844,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->gid, context->euid, context->suid, context->fsuid, context->egid, context->sgid, context->fsgid, tty); + + mutex_unlock(&tty_mutex); + audit_log_task_info(ab, tsk); if (context->filterkey) { audit_log_format(ab, " key="); @@ -1132,6 +1141,7 @@ void audit_syscall_entry(int arch, int major, context->ctime = CURRENT_TIME; context->in_syscall = 1; context->auditable = !!(state == AUDIT_RECORD_CONTEXT); + context->ppid = 0; } /** @@ -1347,7 +1357,13 @@ void __audit_inode_child(const char *dname, const struct inode *inode, } update_context: - idx = context->name_count++; + idx = context->name_count; + if (context->name_count == AUDIT_NAMES) { + printk(KERN_DEBUG "name_count maxed and losing %s\n", + found_name ?: "(null)"); + return; + } + context->name_count++; #if AUDIT_DEBUG context->ino_count++; #endif @@ -1365,7 +1381,16 @@ update_context: /* A parent was not found in audit_names, so copy the inode data for the * provided parent. 
*/ if (!found_name) { - idx = context->name_count++; + idx = context->name_count; + if (context->name_count == AUDIT_NAMES) { + printk(KERN_DEBUG + "name_count maxed and losing parent inode data: dev=%02x:%02x, inode=%lu", + MAJOR(parent->i_sb->s_dev), + MINOR(parent->i_sb->s_dev), + parent->i_ino); + return; + } + context->name_count++; #if AUDIT_DEBUG context->ino_count++; #endif diff --git a/kernel/capability.c b/kernel/capability.c index c7685ad00a97..edb845a6e84a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective, int found = 0; do_each_thread(g, target) { - if (target == current || target->pid == 1) + if (target == current || is_init(target)) continue; found = 1; if (security_capset_check(target, effective, inheritable, diff --git a/kernel/compat.c b/kernel/compat.c index 126dee9530aa..75573e5d27b0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -22,6 +22,7 @@ #include <linux/security.h> #include <linux/timex.h> #include <linux/migrate.h> +#include <linux/posix-timers.h> #include <asm/uaccess.h> @@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock, return err; } +static long compat_clock_nanosleep_restart(struct restart_block *restart) +{ + long err; + mm_segment_t oldfs; + struct timespec tu; + struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); + + restart->arg1 = (unsigned long) &tu; + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = clock_nanosleep_restart(restart); + set_fs(oldfs); + + if ((err == -ERESTART_RESTARTBLOCK) && rmtp && + put_compat_timespec(&tu, rmtp)) + return -EFAULT; + + if (err == -ERESTART_RESTARTBLOCK) { + restart->fn = compat_clock_nanosleep_restart; + restart->arg1 = (unsigned long) rmtp; + } + return err; +} + long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) @@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, long err; mm_segment_t oldfs; struct timespec in, out; + struct restart_block *restart; if (get_compat_timespec(&in, rqtp)) return -EFAULT; @@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, (struct timespec __user *) &in, (struct timespec __user *) &out); set_fs(oldfs); + if ((err == -ERESTART_RESTARTBLOCK) && rmtp && put_compat_timespec(&out, rmtp)) return -EFAULT; + + if (err == -ERESTART_RESTARTBLOCK) { + restart = &current_thread_info()->restart_block; + restart->fn = compat_clock_nanosleep_restart; + restart->arg1 = (unsigned long) rmtp; + } return err; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cff41511269f..9d850ae13b1b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -240,7 +240,7 @@ static struct super_block *cpuset_sb; * A cpuset can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cpusets is empty. Since all * tasks in the system use _some_ cpuset, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cpuset + * least one task in the system (init), therefore, top_cpuset * always has either children cpusets and/or using tasks. So we don't * need a special hack to ensure that top_cpuset cannot be deleted.
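A note on the compat_sys_clock_nanosleep() hunk above: without the interposed compat_clock_nanosleep_restart(), a restarted sleep would have written a native struct timespec through what is really a struct compat_timespec __user pointer. The sketch below illustrates the underlying set_fs(KERNEL_DS) pattern the compat layer uses; the helper name is hypothetical and error handling is trimmed.

#include <linux/time.h>
#include <linux/thread_info.h>
#include <asm/uaccess.h>

/*
 * Hypothetical sketch: run the native restart handler against an
 * on-stack kernel timespec.  Raising the address limit to KERNEL_DS
 * lets the native code's __user accesses accept the kernel pointer;
 * the compat caller then converts the result to a compat_timespec.
 */
static long restart_with_kernel_ts(struct restart_block *restart,
                                   struct timespec *kts)
{
        mm_segment_t oldfs = get_fs();
        long err;

        restart->arg1 = (unsigned long) kts;    /* kernel buffer */
        set_fs(KERNEL_DS);
        err = clock_nanosleep_restart(restart);
        set_fs(oldfs);
        return err;
}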
* @@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode) inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; @@ -378,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directories start off with i_nlink == 2 (for "." entry) */ - inode->i_nlink++; + inc_nlink(inode); } else { return -ENOMEM; } @@ -913,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) int fudge; int retval; + /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */ + if (cs == &top_cpuset) + return -EACCES; + trialcs = *cs; retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) @@ -1222,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) task_lock(tsk); oldcs = tsk->cpuset; - if (!oldcs) { + /* + * After getting 'oldcs' cpuset ptr, be sure still not exiting. + * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack + * then fail this attach_task(), to avoid breaking top_cpuset.count. + */ + if (tsk->flags & PF_EXITING) { task_unlock(tsk); mutex_unlock(&callback_mutex); put_task_struct(tsk); @@ -1557,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) inode->i_fop = &simple_dir_operations; /* start off with i_nlink == 2 (for "." entry) */ - inode->i_nlink++; + inc_nlink(inode); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cpuset_file_operations; @@ -1590,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode) error = cpuset_create_file(dentry, S_IFDIR | mode); if (!error) { dentry->d_fsdata = cs; - parent->d_inode->i_nlink++; + inc_nlink(parent->d_inode); cs->dentry = dentry; } dput(dentry); @@ -2025,7 +2033,7 @@ int __init cpuset_init(void) } root = cpuset_mount->mnt_sb->s_root; root->d_fsdata = &top_cpuset; - root->d_inode->i_nlink++; + inc_nlink(root->d_inode); top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; number_of_cpusets = 1; @@ -2037,33 +2045,104 @@ out: return err; } +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG) /* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * This handles CPU hotplug (cpuhp) events. If someday Memory - * Nodes can be hotplugged (dynamically changing node_online_map) - * then we should handle that too, perhaps in a similar way. + * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then the guarantee_online_cpus() + * or guarantee_online_mems() code will use that emptied cpusets + * parent online CPUs or nodes. Cpusets that were already empty of + * CPUs or nodes are left empty. + * + * This routine is intentionally inefficient in a couple of regards. + * It will check all cpusets in a subtree even if the top cpuset of + * the subtree has no offline CPUs or nodes. 
It checks both CPUs and + * nodes, even though the caller could have been coded to know that + * only one of CPUs or nodes needed to be checked on a given call. + * This was done to minimize text size rather than cpu cycles. + * + * Call with both manage_mutex and callback_mutex held. + * + * Recursive, on depth of cpuset subtree. */ -#ifdef CONFIG_HOTPLUG_CPU -static int cpuset_handle_cpuhp(struct notifier_block *nb, - unsigned long phase, void *cpu) +static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) +{ + struct cpuset *c; + + /* Each of our child cpusets mems must be online */ + list_for_each_entry(c, &cur->children, sibling) { + guarantee_online_cpus_mems_in_subtree(c); + if (!cpus_empty(c->cpus_allowed)) + guarantee_online_cpus(c, &c->cpus_allowed); + if (!nodes_empty(c->mems_allowed)) + guarantee_online_mems(c, &c->mems_allowed); + } +} + +/* + * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track + * cpu_online_map and node_online_map. Force the top cpuset to track + * whats online after any CPU or memory node hotplug or unplug event. + * + * To ensure that we don't remove a CPU or node from the top cpuset + * that is currently in use by a child cpuset (which would violate + * the rule that cpusets must be subsets of their parent), we first + * call the recursive routine guarantee_online_cpus_mems_in_subtree(). + * + * Since there are two callers of this routine, one for CPU hotplug + * events and one for memory node hotplug events, we could have coded + * two separate routines here. We code it as a single common routine + * in order to minimize text size. + */ + +static void common_cpu_mem_hotplug_unplug(void) { mutex_lock(&manage_mutex); mutex_lock(&callback_mutex); + guarantee_online_cpus_mems_in_subtree(&top_cpuset); top_cpuset.cpus_allowed = cpu_online_map; + top_cpuset.mems_allowed = node_online_map; mutex_unlock(&callback_mutex); mutex_unlock(&manage_mutex); +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* + * The top_cpuset tracks what CPUs and Memory Nodes are online, + * period. This is necessary in order to make cpusets transparent + * (of no affect) on systems that are actively using CPU hotplug + * but making no active use of cpusets. + * + * This routine ensures that top_cpuset.cpus_allowed tracks + * cpu_online_map on each CPU hotplug (cpuhp) event. + */ +static int cpuset_handle_cpuhp(struct notifier_block *nb, + unsigned long phase, void *cpu) +{ + common_cpu_mem_hotplug_unplug(); return 0; } #endif +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Keep top_cpuset.mems_allowed tracking node_online_map. + * Call this routine anytime after you change node_online_map. + * See also the previous routine cpuset_handle_cpuhp(). 
+ */ + +void cpuset_track_online_nodes() +{ + common_cpu_mem_hotplug_unplug(); +} +#endif + /** * cpuset_init_smp - initialize cpus_allowed * diff --git a/kernel/dma.c b/kernel/dma.c index aef0a45b7893..2020644c938a 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -62,6 +62,11 @@ static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { }; +/** + * request_dma - request and reserve a system DMA channel + * @dmanr: DMA channel number + * @device_id: reserving device ID string, used in /proc/dma + */ int request_dma(unsigned int dmanr, const char * device_id) { if (dmanr >= MAX_DMA_CHANNELS) @@ -76,7 +81,10 @@ int request_dma(unsigned int dmanr, const char * device_id) return 0; } /* request_dma */ - +/** + * free_dma - free a reserved system DMA channel + * @dmanr: DMA channel number + */ void free_dma(unsigned int dmanr) { if (dmanr >= MAX_DMA_CHANNELS) { diff --git a/kernel/exit.c b/kernel/exit.c index d891883420f7..f250a5e3e281 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -18,8 +18,10 @@ #include <linux/security.h> #include <linux/cpu.h> #include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/binfmts.h> +#include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> @@ -38,6 +40,7 @@ #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> +#include <linux/blkdev.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -219,7 +222,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->exit_state - || p->real_parent->pid == 1) + || is_init(p->real_parent)) continue; if (process_group(p->real_parent) != pgrp && p->real_parent->signal->session == p->signal->session) { @@ -249,17 +252,6 @@ static int has_stopped_jobs(int pgrp) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p->state != TASK_STOPPED) continue; - - /* If p is stopped by a debugger on a signal that won't - stop it, then don't count p as stopped. This isn't - perfect but it's a good approximation. */ - if (unlikely (p->ptrace) - && p->exit_code != SIGSTOP - && p->exit_code != SIGTSTP - && p->exit_code != SIGTTOU - && p->exit_code != SIGTTIN) - continue; - retval = 1; break; } while_each_task_pid(pgrp, PIDTYPE_PGID, p); @@ -292,9 +284,7 @@ static void reparent_to_init(void) /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if ((current->policy == SCHED_NORMAL || - current->policy == SCHED_BATCH) - && (task_nice(current) < 0)) + if (!has_rt_policy(current) && (task_nice(current) < 0)) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ @@ -408,9 +398,11 @@ void daemonize(const char *name, ...) 
fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); - exit_namespace(current); - current->namespace = init_task.namespace; - get_namespace(current->namespace); + + exit_task_namespaces(current); + current->nsproxy = init_task.nsproxy; + get_task_namespaces(current); + exit_files(current); current->files = init_task.files; atomic_inc(&current->files->count); @@ -487,6 +479,18 @@ void fastcall put_files_struct(struct files_struct *files) EXPORT_SYMBOL(put_files_struct); +void reset_files_struct(struct task_struct *tsk, struct files_struct *files) +{ + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} +EXPORT_SYMBOL(reset_files_struct); + static inline void __exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -916,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code) exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); - exit_namespace(tsk); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); @@ -931,6 +934,7 @@ fastcall NORET_TYPE void do_exit(long code) tsk->exit_code = code; proc_exit_connector(tsk); exit_notify(tsk); + exit_task_namespaces(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; @@ -954,15 +958,15 @@ fastcall NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* PF_DEAD causes final put_task_struct after we schedule. */ preempt_disable(); - BUG_ON(tsk->flags & PF_DEAD); - tsk->flags |= PF_DEAD; + /* causes final put_task_struct in finish_task_switch(). */ + tsk->state = TASK_DEAD; schedule(); BUG(); /* Avoid "noreturn function does return". */ - for (;;) ; + for (;;) + cpu_relax(); /* For when BUG is null */ } EXPORT_SYMBOL_GPL(do_exit); @@ -971,7 +975,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); - + do_exit(code); } diff --git a/kernel/fork.c b/kernel/fork.c index f9b014e3e700..7dc6140baac6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -27,6 +27,7 @@ #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/fs.h> +#include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -42,9 +43,11 @@ #include <linux/profile.h> #include <linux/rmap.h> #include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> +#include <linux/random.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -175,10 +178,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->thread_info = ti; setup_thread_stack(tsk, orig); +#ifdef CONFIG_CC_STACKPROTECTOR + tsk->stack_canary = get_random_int(); +#endif + /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); +#ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; +#endif tsk->splice_pipe = NULL; return tsk; } @@ -1056,7 +1065,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + p->hardirqs_enabled = 1; +#else p->hardirqs_enabled = 0; +#endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; @@ -1104,11 +1117,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_signal; if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; - if ((retval = copy_namespace(clone_flags,
p))) + if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* @@ -1139,7 +1152,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; /* ok, now we should be set up.. */ @@ -1162,6 +1174,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); + /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ + p->ioprio = current->ioprio; + /* * The task hasn't been attached yet, so its cpus_allowed mask will * not be changed, nor will its assigned CPU. @@ -1198,7 +1213,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; } if (clone_flags & CLONE_THREAD) { @@ -1221,11 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } - /* - * inherit ioprio - */ - p->ioprio = current->ioprio; - if (likely(p->pid)) { add_parent(p); if (unlikely(p->ptrace & PT_PTRACED)) @@ -1251,8 +1261,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); return p; -bad_fork_cleanup_namespace: - exit_namespace(p); +bad_fork_cleanup_namespaces: + exit_task_namespaces(p); bad_fork_cleanup_keys: exit_keys(p); bad_fork_cleanup_mm: @@ -1505,10 +1515,9 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) */ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->namespace; + struct namespace *ns = current->nsproxy->namespace; - if ((unshare_flags & CLONE_NEWNS) && - (ns && atomic_read(&ns->count) > 1)) { + if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1580,6 +1589,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } +#ifndef CONFIG_IPC_NS +static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) +{ + if (flags & CLONE_NEWIPC) + return -EINVAL; + + return 0; +} +#endif + /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. 
copy_* @@ -1597,13 +1616,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; + struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; + struct uts_namespace *uts, *new_uts = NULL; + struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); /* Return -EINVAL for all unsupported flags */ err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) @@ -1620,11 +1643,30 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; + if ((err = unshare_utsname(unshare_flags, &new_uts))) + goto bad_unshare_cleanup_semundo; + if ((err = unshare_ipcs(unshare_flags, &new_ipc))) + goto bad_unshare_cleanup_uts; + + if (new_ns || new_uts || new_ipc) { + old_nsproxy = current->nsproxy; + new_nsproxy = dup_namespaces(old_nsproxy); + if (!new_nsproxy) { + err = -ENOMEM; + goto bad_unshare_cleanup_ipc; + } + } - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + new_uts || new_ipc) { task_lock(current); + if (new_nsproxy) { + current->nsproxy = new_nsproxy; + new_nsproxy = old_nsproxy; + } + if (new_fs) { fs = current->fs; current->fs = new_fs; @@ -1632,8 +1674,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->namespace; - current->namespace = new_ns; + ns = current->nsproxy->namespace; + current->nsproxy->namespace = new_ns; new_ns = ns; } @@ -1658,9 +1700,33 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } + if (new_uts) { + uts = current->nsproxy->uts_ns; + current->nsproxy->uts_ns = new_uts; + new_uts = uts; + } + + if (new_ipc) { + ipc = current->nsproxy->ipc_ns; + current->nsproxy->ipc_ns = new_ipc; + new_ipc = ipc; + } + task_unlock(current); } + if (new_nsproxy) + put_nsproxy(new_nsproxy); + +bad_unshare_cleanup_ipc: + if (new_ipc) + put_ipc_ns(new_ipc); + +bad_unshare_cleanup_uts: + if (new_uts) + put_uts_ns(new_uts); + +bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/futex.c b/kernel/futex.c index 9d260e838cff..4aaf91951a43 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -389,7 +389,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (!p) goto out_unlock; @@ -403,7 +403,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) } get_task_struct(p); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return p; } @@ -1527,7 +1527,7 @@ static int futex_fd(u32 __user *uaddr, int signal) filp->f_mapping = filp->f_dentry->d_inode->i_mapping; if (signal) { - err = f_setown(filp, current->pid, 1); + err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); if (err < 0) { goto error; } @@ -1624,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, struct task_struct *p; ret = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (!p) goto err_unlock; @@ -1633,7 +1633,7 @@ sys_get_robust_list(int pid, struct 
robust_list_head __user **head_ptr, !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (put_user(sizeof(*head), len_ptr)) @@ -1641,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, return put_user(head, head_ptr); err_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 21c38a7e666b..d0ba190dfeb6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return t->task == NULL; } -static long __sched nanosleep_restart(struct restart_block *restart) +long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; struct timespec __user *rmtp; @@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart) restart->fn = do_no_restart_syscall; - hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS); - t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; + hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); + t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; if (do_nanosleep(&t, HRTIMER_ABS)) return 0; - rmtp = (struct timespec __user *) restart->arg2; + rmtp = (struct timespec __user *) restart->arg1; if (rmtp) { time = ktime_sub(t.timer.expires, t.timer.base->get_time()); if (time.tv64 <= 0) @@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) return -EFAULT; } - restart->fn = nanosleep_restart; + restart->fn = hrtimer_nanosleep_restart; /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; @@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, } restart = &current_thread_info()->restart_block; - restart->fn = nanosleep_restart; - restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF; - restart->arg1 = t.timer.expires.tv64 >> 32; - restart->arg2 = (unsigned long) rmtp; - restart->arg3 = (unsigned long) t.timer.base->index; + restart->fn = hrtimer_nanosleep_restart; + restart->arg0 = (unsigned long) t.timer.base->index; + restart->arg1 = (unsigned long) rmtp; + restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; + restart->arg3 = t.timer.expires.tv64 >> 32; return -ERESTART_RESTARTBLOCK; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ac1f850d4937..4cf65f5c6a74 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,6 +18,69 @@ #include "internals.h" /** + * dynamic_irq_init - initialize a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_init(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); + WARN_ON(1); + return; + } + + /* Ensure we don't have left over values from a previous use of this irq */ + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + desc->status = IRQ_DISABLED; + desc->chip = &no_irq_chip; + desc->handle_irq = handle_bad_irq; + desc->depth = 1; + desc->handler_data = NULL; + desc->chip_data = NULL; + desc->action = NULL; + desc->irq_count = 0; + desc->irqs_unhandled = 0; +#ifdef CONFIG_SMP + desc->affinity = CPU_MASK_ALL; +#endif + spin_unlock_irqrestore(&desc->lock, flags); +} + +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ +
struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); + WARN_ON(1); + return; + } + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + if (desc->action) { + spin_unlock_irqrestore(&desc->lock, flags); + printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", + irq); + WARN_ON(1); + return; + } + desc->handle_irq = handle_bad_irq; + desc->chip = &no_irq_chip; + spin_unlock_irqrestore(&desc->lock, flags); +} + + +/** * set_irq_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure @@ -40,10 +103,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; - /* - * For compatibility only: - */ - desc->chip = chip; spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -146,7 +205,7 @@ static void default_disable(unsigned int irq) struct irq_desc *desc = irq_desc + irq; if (!(desc->status & IRQ_DELAYED_DISABLE)) - irq_desc[irq].chip->mask(irq); + desc->chip->mask(irq); } /* diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index a57ebe9fa6f6..4baa3bbcd25a 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,17 +7,17 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) unsigned long flags; spin_lock_irqsave(&desc->lock, flags); - desc->move_irq = 1; + desc->status |= IRQ_MOVE_PENDING; irq_desc[irq].pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } -void move_native_irq(int irq) +void move_masked_irq(int irq) { struct irq_desc *desc = irq_desc + irq; cpumask_t tmp; - if (likely(!desc->move_irq)) + if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; /* @@ -28,7 +28,7 @@ void move_native_irq(int irq) return; } - desc->move_irq = 0; + desc->status &= ~IRQ_MOVE_PENDING; if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) return; @@ -48,15 +48,29 @@ void move_native_irq(int irq) * when an active trigger is comming in. This could * cause some ioapics to mal-function. * Being paranoid i guess! + * + * For correct operation this depends on the caller + * masking the irqs. 
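The migration.c rework in this hunk splits the old move_native_irq() into two entry points: move_masked_irq() performs only the set_affinity() step and relies on its caller to have masked the irq, while the new move_native_irq() wraps it in disable()/enable() for unmasked callers. A hedged sketch of the masked-caller contract (the handler name is made up):

#include <linux/irq.h>

/*
 * Hypothetical flow-handler fragment.  Per the comment above,
 * move_masked_irq() is only correct when the irq is already masked,
 * so set_affinity() cannot race with an in-flight trigger.
 */
static void example_mask_and_move(unsigned int irq)
{
        struct irq_desc *desc = irq_desc + irq;

        desc->chip->mask(irq);  /* satisfy move_masked_irq()'s contract */
        move_masked_irq(irq);   /* may call desc->chip->set_affinity() */
        desc->chip->unmask(irq);
}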
*/ if (likely(!cpus_empty(tmp))) { - if (likely(!(desc->status & IRQ_DISABLED))) - desc->chip->disable(irq); - desc->chip->set_affinity(irq,tmp); - - if (likely(!(desc->status & IRQ_DISABLED))) - desc->chip->enable(irq); } cpus_clear(irq_desc[irq].pending_mask); } + +void move_native_irq(int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + if (likely(!(desc->status & IRQ_MOVE_PENDING))) + return; + + if (likely(!(desc->status & IRQ_DISABLED))) + desc->chip->disable(irq); + + move_masked_irq(irq); + + if (likely(!(desc->status & IRQ_DISABLED))) + desc->chip->enable(irq); +} + diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab16a5a4cfe9..eeac3e313b2b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -69,6 +69,15 @@ static inline int is_kernel(unsigned long addr) return in_gate_area_no_task(addr); } +static int is_ksym_addr(unsigned long addr) +{ + if (all_var) + return is_kernel(addr); + + return is_kernel_text(addr) || is_kernel_inittext(addr) || + is_kernel_extratext(addr); +} + /* expand a compressed symbol data into the resulting uncompressed string, given the offset to where the symbol is in the compressed stream */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) @@ -154,7 +163,73 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); + +static unsigned long get_symbol_pos(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset) +{ + unsigned long symbol_start = 0, symbol_end = 0; + unsigned long i, low, high, mid; + + /* This kernel should never had been booted. */ + BUG_ON(!kallsyms_addresses); + + /* do a binary search on the sorted kallsyms_addresses array */ + low = 0; + high = kallsyms_num_syms; + + while (high - low > 1) { + mid = (low + high) / 2; + if (kallsyms_addresses[mid] <= addr) + low = mid; + else + high = mid; + } + + /* + * search for the first aliased symbol. Aliased + * symbols are symbols with the same address + */ + while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + --low; + + symbol_start = kallsyms_addresses[low]; + + /* Search for next non-aliased symbol */ + for (i = low + 1; i < kallsyms_num_syms; i++) { + if (kallsyms_addresses[i] > symbol_start) { + symbol_end = kallsyms_addresses[i]; + break; + } + } + + /* if we found no next symbol, we use the end of the section */ + if (!symbol_end) { + if (is_kernel_inittext(addr)) + symbol_end = (unsigned long)_einittext; + else if (all_var) + symbol_end = (unsigned long)_end; + else + symbol_end = (unsigned long)_etext; + } + + *symbolsize = symbol_end - symbol_start; + *offset = addr - symbol_start; + + return low; +} + +/* + * Lookup an address but don't bother to find any names. + */ +int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, + unsigned long *offset) +{ + if (is_ksym_addr(addr)) + return !!get_symbol_pos(addr, symbolsize, offset); + + return !!module_address_lookup(addr, symbolsize, offset, NULL); +} /* * Lookup an address @@ -168,57 +243,18 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { - unsigned long i, low, high, mid; const char *msym; - /* This kernel should never had been booted. 
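The new kallsyms_lookup_size_offset() above factors the binary search out of kallsyms_lookup() for callers that need only the extent of a symbol, not its name (kprobes, later in this diff, is one such caller). A hedged usage sketch, assuming the declaration lands in linux/kallsyms.h:

#include <linux/kernel.h>
#include <linux/kallsyms.h>

/* Hypothetical: report how far addr sits into its enclosing symbol. */
static void report_symbol_extent(unsigned long addr)
{
        unsigned long size, offset;

        if (kallsyms_lookup_size_offset(addr, &size, &offset))
                printk(KERN_DEBUG "%#lx: offset %#lx into a %#lx-byte symbol\n",
                       addr, offset, size);
}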
*/ - BUG_ON(!kallsyms_addresses); - namebuf[KSYM_NAME_LEN] = 0; namebuf[0] = 0; - if ((all_var && is_kernel(addr)) || - (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) || - is_kernel_extratext(addr)))) { - unsigned long symbol_end = 0; - - /* do a binary search on the sorted kallsyms_addresses array */ - low = 0; - high = kallsyms_num_syms; - - while (high-low > 1) { - mid = (low + high) / 2; - if (kallsyms_addresses[mid] <= addr) low = mid; - else high = mid; - } - - /* search for the first aliased symbol. Aliased symbols are - symbols with the same address */ - while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low]) - --low; + if (is_ksym_addr(addr)) { + unsigned long pos; + pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(low), namebuf); - - /* Search for next non-aliased symbol */ - for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > kallsyms_addresses[low]) { - symbol_end = kallsyms_addresses[i]; - break; - } - } - - /* if we found no next symbol, we use the end of the section */ - if (!symbol_end) { - if (is_kernel_inittext(addr)) - symbol_end = (unsigned long)_einittext; - else - symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext; - } - - *symbolsize = symbol_end - kallsyms_addresses[low]; + kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); *modname = NULL; - *offset = addr - kallsyms_addresses[low]; return namebuf; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 50087ecf337e..fcdd5d2bc3f4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -40,7 +40,7 @@ struct resource crashk_res = { int kexec_should_crash(struct task_struct *p) { - if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) + if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) return 1; return 0; } @@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: - xchg(&kexec_lock, 0); /* Release the mutex */ + locked = xchg(&kexec_lock, 0); /* Release the mutex */ + BUG_ON(!locked); kimage_free(image); return result; @@ -1061,7 +1062,8 @@ void crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - xchg(&kexec_lock, 0); + locked = xchg(&kexec_lock, 0); + BUG_ON(!locked); } } diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 64ab045c3d9d..5d1d907378a2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo, len = min(len, fifo->size - fifo->in + fifo->out); + /* + * Ensure that we sample the fifo->out index -before- we + * start putting bytes into the kfifo. + */ + + smp_mb(); + /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); @@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo, /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, buffer + l, len - l); + /* + * Ensure that we add the bytes to the kfifo -before- + * we update the fifo->in index. + */ + + smp_wmb(); + fifo->in += len; return len; @@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo, len = min(len, fifo->in - fifo->out); + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. 
+ */ + + smp_rmb(); + /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); @@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo, /* then get the rest (if any) from the beginning of the buffer */ memcpy(buffer + l, fifo->buffer, len - l); + /* + * Ensure that we remove the bytes from the kfifo -before- + * we update the fifo->out index. + */ + + smp_mb(); + fifo->out += len; return len; diff --git a/kernel/kmod.c b/kernel/kmod.c index 5c470c57fb57..bb4e29d924e4 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -18,8 +18,6 @@ call_usermodehelper wait flag, and remove exec_usermodehelper. Rusty Russell <rusty@rustcorp.com.au> Jan 2003 */ -#define __KERNEL_SYSCALLS__ - #include <linux/module.h> #include <linux/sched.h> #include <linux/syscalls.h> @@ -35,6 +33,7 @@ #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/resource.h> #include <asm/uaccess.h> extern int max_threads; @@ -122,6 +121,7 @@ struct subprocess_info { struct key *ring; int wait; int retval; + struct file *stdin; }; /* @@ -145,12 +145,30 @@ static int ____call_usermodehelper(void *data) key_put(old_session); + /* Install input pipe when needed */ + if (sub_info->stdin) { + struct files_struct *f = current->files; + struct fdtable *fdt; + /* no races because files should be private here */ + sys_close(0); + fd_install(0, sub_info->stdin); + spin_lock(&f->file_lock); + fdt = files_fdtable(f); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&f->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0}; + } + /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); retval = -EPERM; if (current->fs->root) - retval = execve(sub_info->path, sub_info->argv,sub_info->envp); + retval = kernel_execve(sub_info->path, + sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; @@ -176,6 +194,8 @@ static int wait_for_helper(void *data) if (pid < 0) { sub_info->retval = pid; } else { + int ret; + /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. @@ -185,7 +205,15 @@ static int wait_for_helper(void *data) * * Thus the __user pointer cast is valid here. */ - sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL); + sys_wait4(pid, (int __user *)&ret, 0, NULL); + + /* + * If ret is 0, either ____call_usermodehelper failed and the + * real error code is already in sub_info->retval or + * sub_info->retval is 0 anyway, so don't mess with it then. 
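The smp_mb()/smp_wmb()/smp_rmb() pairs added to __kfifo_put() and __kfifo_get() above order the data copies against the index updates, which is what makes the one-writer/one-reader case safe without taking fifo->lock. A minimal sketch of that lock-free pairing (function names are made up):

#include <linux/kfifo.h>

/* Only context A (say, an interrupt handler) ever puts ... */
static unsigned int producer_push(struct kfifo *fifo,
                                  unsigned char *buf, unsigned int len)
{
        return __kfifo_put(fifo, buf, len);     /* smp_mb()/smp_wmb() inside */
}

/* ... and only context B ever gets; no spinlock is needed. */
static unsigned int consumer_pop(struct kfifo *fifo,
                                 unsigned char *buf, unsigned int len)
{
        return __kfifo_get(fifo, buf, len);     /* smp_rmb()/smp_mb() inside */
}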
+ */ + if (ret) + sub_info->retval = ret; } complete(sub_info->complete); @@ -258,6 +286,44 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, } EXPORT_SYMBOL(call_usermodehelper_keys); +int call_usermodehelper_pipe(char *path, char **argv, char **envp, + struct file **filp) +{ + DECLARE_COMPLETION(done); + struct subprocess_info sub_info = { + .complete = &done, + .path = path, + .argv = argv, + .envp = envp, + .retval = 0, + }; + struct file *f; + DECLARE_WORK(work, __call_usermodehelper, &sub_info); + + if (!khelper_wq) + return -EBUSY; + + if (path[0] == '\0') + return 0; + + f = create_write_pipe(); + if (!f) + return -ENOMEM; + *filp = f; + + f = create_read_pipe(f); + if (!f) { + free_write_pipe(*filp); + return -ENOMEM; + } + sub_info.stdin = f; + + queue_work(khelper_wq, &work); + wait_for_completion(&done); + return sub_info.retval; +} +EXPORT_SYMBOL(call_usermodehelper_pipe); + void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3f57dfdc8f92..610c837ad9e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <linux/kallsyms.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> @@ -45,6 +46,16 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +/* + * Some oddball architectures like 64bit powerpc have function descriptors + * so this must be overridable. + */ +#ifndef kprobe_lookup_name +#define kprobe_lookup_name(name, addr) \ + addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) +#endif + static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; @@ -308,7 +319,8 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri) } /* Called with kretprobe_lock held */ -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -320,7 +332,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->free_instances); } else /* Unregistering */ - kfree(ri); + hlist_add_head(&ri->hlist, head); } struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) @@ -336,18 +348,24 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) */ void __kprobes kprobe_flush_task(struct task_struct *tk) { - struct kretprobe_instance *ri; - struct hlist_head *head; + struct kretprobe_instance *ri; + struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; unsigned long flags = 0; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(tk); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri); - } + head = kretprobe_inst_table_head(tk); + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task == tk) + recycle_rp_inst(ri, &empty_rp); + } spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } } static inline void free_rp_inst(struct kretprobe *rp) @@ -447,6 +465,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, struct kprobe *old_p; 
struct module *probed_mod; + /* + * If we have a symbol_name argument look it up, + * and add it to the address. That way the addr + * field can either be global or relative to a symbol. + */ + if (p->symbol_name) { + if (p->addr) + return -EINVAL; + kprobe_lookup_name(p->symbol_name, p->addr); + } + + if (!p->addr) + return -EINVAL; + p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); + if ((!kernel_text_address((unsigned long) p->addr)) || in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; @@ -488,7 +521,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, (ARCH_INACTIVE_KPROBE_COUNT + 1)) register_page_fault_notifier(&kprobe_page_fault_nb); - arch_arm_kprobe(p); + arch_arm_kprobe(p); out: mutex_unlock(&kprobe_mutex); diff --git a/kernel/latency.c b/kernel/latency.c new file mode 100644 index 000000000000..258f2555abbc --- /dev/null +++ b/kernel/latency.c @@ -0,0 +1,279 @@ +/* + * latency.c: Explicit system-wide latency-expectation infrastructure + * + * The purpose of this infrastructure is to allow device drivers to set + * the latency constraints they have and to collect and summarize these + * expectations globally. The cumulative result can then be used by + * power management and similar users to make decisions that have + * tradeoffs with a latency component. + * + * An example user of this is the x86 C-states; each higher C state saves + * more power, but has a higher exit latency. For the idle loop power + * code to make a good decision which C-state to use, information about + * acceptable latencies is required. + * + * An example announcer of latency is an audio driver that knows it + * will get an interrupt when the hardware has 200 usec of samples + * left in the DMA buffer; in that case the driver can set a latency + * constraint of, say, 150 usec. + * + * Multiple drivers can each announce their maximum accepted latency; + * to keep these apart, a string-based identifier is used. + * + * + * (C) Copyright 2006 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/latency.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <asm/atomic.h> + +struct latency_info { + struct list_head list; + int usecs; + char *identifier; +}; + +/* + * locking rule: all modifications to current_max_latency and + * latency_list need to be done while holding the latency_lock. + * latency_lock needs to be taken _irqsave. + */ +static atomic_t current_max_latency; +static DEFINE_SPINLOCK(latency_lock); + +static LIST_HEAD(latency_list); +static BLOCKING_NOTIFIER_HEAD(latency_notifier); + +/* + * This function returns the maximum latency allowed, which + * happens to be the minimum of all maximum latencies on the + * list. + */ +static int __find_max_latency(void) +{ + int min = INFINITE_LATENCY; + struct latency_info *info; + + list_for_each_entry(info, &latency_list, list) { + if (info->usecs < min) + min = info->usecs; + } + return min; } + +/** + * set_acceptable_latency - sets the maximum latency acceptable + * @identifier: string that identifies this driver + * @usecs: maximum acceptable latency for this driver + * + * This function informs the kernel that this device(driver) + * can accept at most usecs latency.
This setting is used for + * power management and similar tradeoffs. + * + * This function sleeps and can only be called from process + * context. + * Calling this function with an existing identifier is valid + * and will cause the existing latency setting to be changed. + */ +void set_acceptable_latency(char *identifier, int usecs) +{ + struct latency_info *info, *iter; + unsigned long flags; + int found_old = 0; + + info = kzalloc(sizeof(struct latency_info), GFP_KERNEL); + if (!info) + return; + info->usecs = usecs; + info->identifier = kstrdup(identifier, GFP_KERNEL); + if (!info->identifier) + goto free_info; + + spin_lock_irqsave(&latency_lock, flags); + list_for_each_entry(iter, &latency_list, list) { + if (strcmp(iter->identifier, identifier)==0) { + found_old = 1; + iter->usecs = usecs; + break; + } + } + if (!found_old) + list_add(&info->list, &latency_list); + + if (usecs < atomic_read(&current_max_latency)) + atomic_set(&current_max_latency, usecs); + + spin_unlock_irqrestore(&latency_lock, flags); + + blocking_notifier_call_chain(&latency_notifier, + atomic_read(&current_max_latency), NULL); + + /* + * if we inserted the new one, we're done; otherwise there was + * an existing one so we need to free the redundant data + */ + if (!found_old) + return; + + kfree(info->identifier); +free_info: + kfree(info); +} +EXPORT_SYMBOL_GPL(set_acceptable_latency); + +/** + * modify_acceptable_latency - changes the maximum latency acceptable + * @identifier: string that identifies this driver + * @usecs: maximum acceptable latency for this driver + * + * This function informs the kernel that this device(driver) + * can accept at most usecs latency. This setting is used for + * power management and similar tradeoffs. + * + * This function does not sleep and can be called in any context. + * Trying to use a non-existing identifier silently gets ignored. + * + * Due to the atomic nature of this function, the modified latency + * value will only be used for future decisions; past decisions + * can still lead to longer latencies in the near future. + */ +void modify_acceptable_latency(char *identifier, int usecs) +{ + struct latency_info *iter; + unsigned long flags; + + spin_lock_irqsave(&latency_lock, flags); + list_for_each_entry(iter, &latency_list, list) { + if (strcmp(iter->identifier, identifier) == 0) { + iter->usecs = usecs; + break; + } + } + if (usecs < atomic_read(&current_max_latency)) + atomic_set(&current_max_latency, usecs); + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL_GPL(modify_acceptable_latency); + +/** + * remove_acceptable_latency - removes the maximum latency acceptable + * @identifier: string that identifies this driver + * + * This function removes a previously set maximum latency setting + * for the driver and frees up any resources associated with the + * bookkeeping needed for this. + * + * This function does not sleep and can be called in any context. + * Trying to use a non-existing identifier silently gets ignored.
+ */ +void remove_acceptable_latency(char *identifier) +{ + unsigned long flags; + int newmax = 0; + struct latency_info *iter, *temp; + + spin_lock_irqsave(&latency_lock, flags); + + list_for_each_entry_safe(iter, temp, &latency_list, list) { + if (strcmp(iter->identifier, identifier) == 0) { + list_del(&iter->list); + newmax = iter->usecs; + kfree(iter->identifier); + kfree(iter); + break; + } + } + + /* If we just deleted the system wide value, we need to + * recalculate with a full search + */ + if (newmax == atomic_read(&current_max_latency)) { + newmax = __find_max_latency(); + atomic_set(&current_max_latency, newmax); + } + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL_GPL(remove_acceptable_latency); + +/** + * system_latency_constraint - queries the system wide latency maximum + * + * This function returns the system wide maximum latency in + * microseconds. + * + * This function does not sleep and can be called in any context. + */ +int system_latency_constraint(void) +{ + return atomic_read(&current_max_latency); +} +EXPORT_SYMBOL_GPL(system_latency_constraint); + +/** + * synchronize_acceptable_latency - recalculates all latency decisions + * + * This function will cause a callback to various kernel pieces that + * will make those pieces rethink their latency decisions. This implies + * that if there are overlong latencies in hardware state already, those + * latencies get taken right now. When this call completes no overlong + * latency decisions should be active anymore. + * + * Typical use case of this is after a modify_acceptable_latency() call, + * which in itself is non-blocking and non-synchronizing. + * + * This function blocks and should not be called with locks held. + */ + +void synchronize_acceptable_latency(void) +{ + blocking_notifier_call_chain(&latency_notifier, + atomic_read(&current_max_latency), NULL); +} +EXPORT_SYMBOL_GPL(synchronize_acceptable_latency); + +/* + * Latency notifier: this notifier gets called when a non-atomic new + * latency value gets set. The expectation of the caller of the + * non-atomic set is that when the call returns, future latencies + * are within bounds, so the functions on the notifier list are + * expected to take the overlong latencies immediately, inside the + * callback, and not make an overlong latency decision anymore. + * + * The callback gets called when the new latency value is made + * active so system_latency_constraint() returns the new latency. + */ +int register_latency_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_register(&latency_notifier, nb); +} +EXPORT_SYMBOL_GPL(register_latency_notifier); + +int unregister_latency_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_unregister(&latency_notifier, nb); +} +EXPORT_SYMBOL_GPL(unregister_latency_notifier); + +static __init int latency_init(void) +{ + atomic_set(&current_max_latency, INFINITE_LATENCY); + /* + * we don't want by default to have longer latencies than 2 ticks, + * since that would cause lost ticks + */ + set_acceptable_latency("kernel", 2*1000000/HZ); + return 0; +} + +module_init(latency_init); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 9bad17884513..4c0553461000 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -36,6 +36,7 @@ #include <linux/stacktrace.h> #include <linux/debug_locks.h> #include <linux/irqflags.h> +#include <linux/utsname.h> #include <asm/sections.h> @@ -121,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE]; * unique.
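Taken together, the latency.c interface added above gives a driver a simple lifecycle. A hedged sketch of how a driver might use it (the identifier and numbers are invented; the 150 usec value mirrors the audio example in the file header):

#include <linux/latency.h>

#define EX_ID "example_audio"          /* hypothetical identifier */

static void example_dma_start(void)
{
        /* irq fires with ~200 usec of samples left, so leave margin */
        set_acceptable_latency(EX_ID, 150);
}

static void example_dma_idle(void)
{
        /* atomic-safe update; then force decisions to be recomputed */
        modify_acceptable_latency(EX_ID, 1000);
        synchronize_acceptable_latency();
}

static void example_teardown(void)
{
        remove_acceptable_latency(EX_ID);
}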
*/ #define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \ + (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ + ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) void lockdep_off(void) @@ -224,7 +225,14 @@ static int save_trace(struct stack_trace *trace) trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; trace->entries = stack_trace + nr_stack_trace_entries; - save_stack_trace(trace, NULL, 0, 3); + trace->skip = 3; + trace->all_contexts = 0; + + /* Make sure to not recurse in case the unwinder needs to take locks. */ + lockdep_off(); + save_stack_trace(trace, NULL); + lockdep_on(); trace->max_entries = trace->nr_entries; @@ -508,6 +516,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) return 0; } +static void print_kernel_version(void) +{ + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +} + /* * When a circular dependency is detected, print the * header first: @@ -524,6 +539,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) printk("\n=======================================================\n"); printk( "[ INFO: possible circular locking dependency detected ]\n"); + print_kernel_version(); printk( "-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, curr->pid); @@ -705,6 +721,7 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\n======================================================\n"); printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); + print_kernel_version(); printk( "------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, curr->pid, @@ -786,6 +803,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, printk("\n=============================================\n"); printk( "[ INFO: possible recursive locking detected ]\n"); + print_kernel_version(); printk( "---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, curr->pid); @@ -1368,6 +1386,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, printk("\n=========================================================\n"); printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); + print_kernel_version(); printk( "---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, curr->pid); @@ -1462,6 +1481,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, printk("\n=================================\n"); printk( "[ INFO: inconsistent lock state ]\n"); + print_kernel_version(); printk( "---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", diff --git a/kernel/module.c b/kernel/module.c index b7fe6e840963..7f60e782de1e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -851,6 +851,7 @@ static int check_version(Elf_Shdr *sechdrs, printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; } return 1; } @@ -933,6 +934,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr, return sprintf(buf, "0x%lx\n", sattr->address); } +static void free_sect_attrs(struct module_sect_attrs *sect_attrs) +{ + int section; + + for
(section = 0; section < sect_attrs->nsections; section++) + kfree(sect_attrs->attrs[section].name); + kfree(sect_attrs); +} + static void add_sect_attrs(struct module *mod, unsigned int nsect, char *secstrings, Elf_Shdr *sechdrs) { @@ -949,21 +959,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, + nloaded * sizeof(sect_attrs->attrs[0]), sizeof(sect_attrs->grp.attrs[0])); size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); - if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) + sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); + if (sect_attrs == NULL) return; /* Setup section attributes. */ sect_attrs->grp.name = "sections"; sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; for (i = 0; i < nsect; i++) { if (! (sechdrs[i].sh_flags & SHF_ALLOC)) continue; sattr->address = sechdrs[i].sh_addr; - strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, - MODULE_SECT_NAME_LEN); + sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, + GFP_KERNEL); + if (sattr->name == NULL) + goto out; + sect_attrs->nsections++; sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; @@ -979,7 +994,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, mod->sect_attrs = sect_attrs; return; out: - kfree(sect_attrs); + free_sect_attrs(sect_attrs); } static void remove_sect_attrs(struct module *mod) @@ -989,13 +1004,13 @@ static void remove_sect_attrs(struct module *mod) &mod->sect_attrs->grp); /* We are positive that no one is using any sect attrs * at this point. Deallocate immediately. */ - kfree(mod->sect_attrs); + free_sect_attrs(mod->sect_attrs); mod->sect_attrs = NULL; } } - #else + static inline void add_sect_attrs(struct module *mod, unsigned int nsect, char *sectstrings, Elf_Shdr *sechdrs) { @@ -1325,6 +1340,7 @@ static void set_license(struct module *mod, const char *license) printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", mod->name, license); add_taint(TAINT_PROPRIETARY_MODULE); + mod->taints |= TAINT_PROPRIETARY_MODULE; } } @@ -1604,6 +1620,7 @@ static struct module *load_module(void __user *umod, /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1697,10 +1714,14 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); - if (strcmp(mod->name, "ndiswrapper") == 0) + if (strcmp(mod->name, "ndiswrapper") == 0) { add_taint(TAINT_PROPRIETARY_MODULE); - if (strcmp(mod->name, "driverloader") == 0) + mod->taints |= TAINT_PROPRIETARY_MODULE; + } + if (strcmp(mod->name, "driverloader") == 0) { add_taint(TAINT_PROPRIETARY_MODULE); + mod->taints |= TAINT_PROPRIETARY_MODULE; + } /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); @@ -1746,6 +1767,7 @@ static struct module *load_module(void __user *umod, printk(KERN_WARNING "%s: No versions for exported symbols." 
" Tainting kernel.\n", mod->name); add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; } #endif @@ -2018,7 +2040,8 @@ const char *module_address_lookup(unsigned long addr, list_for_each_entry(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { - *modname = mod->name; + if (modname) + *modname = mod->name; return get_ksymbol(mod, addr, size, offset); } } @@ -2212,14 +2235,37 @@ struct module *module_text_address(unsigned long addr) return mod; } +static char *taint_flags(unsigned int taints, char *buf) +{ + *buf = '\0'; + if (taints) { + int bx; + + buf[0] = '('; + bx = 1; + if (taints & TAINT_PROPRIETARY_MODULE) + buf[bx++] = 'P'; + if (taints & TAINT_FORCED_MODULE) + buf[bx++] = 'F'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + buf[bx] = ')'; + } + return buf; +} + /* Don't grab lock, we're oopsing. */ void print_modules(void) { struct module *mod; + char buf[8]; printk("Modules linked in:"); list_for_each_entry(mod, &modules, list) - printk(" %s", mod->name); + printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); printk("\n"); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c new file mode 100644 index 000000000000..6ebdb82a0ce4 --- /dev/null +++ b/kernel/nsproxy.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Author: Serge Hallyn <serue@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * Jun 2006 - namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/nsproxy.h> +#include <linux/init_task.h> +#include <linux/namespace.h> +#include <linux/utsname.h> + +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); + +static inline void get_nsproxy(struct nsproxy *ns) +{ + atomic_inc(&ns->count); +} + +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + +/* + * creates a copy of "orig" with refcount 1. + * This does not grab references to the contained namespaces, + * so that needs to be done by dup_namespaces. + */ +static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns; + + ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); + if (ns) { + memcpy(ns, orig, sizeof(struct nsproxy)); + atomic_set(&ns->count, 1); + } + return ns; +} + +/* + * copies the nsproxy, setting refcount to 1, and grabbing a + * reference to all contained namespaces. Called from + * sys_unshare() + */ +struct nsproxy *dup_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns = clone_namespaces(orig); + + if (ns) { + if (ns->namespace) + get_namespace(ns->namespace); + if (ns->uts_ns) + get_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + get_ipc_ns(ns->ipc_ns); + } + + return ns; +} + +/* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. 
+ */ +int copy_namespaces(int flags, struct task_struct *tsk) +{ + struct nsproxy *old_ns = tsk->nsproxy; + struct nsproxy *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_nsproxy(old_ns); + + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + + new_ns = clone_namespaces(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->nsproxy = new_ns; + + err = copy_namespace(flags, tsk); + if (err) + goto out_ns; + + err = copy_utsname(flags, tsk); + if (err) + goto out_uts; + + err = copy_ipcs(flags, tsk); + if (err) + goto out_ipc; + +out: + put_nsproxy(old_ns); + return err; + +out_ipc: + if (new_ns->uts_ns) + put_uts_ns(new_ns->uts_ns); +out_uts: + if (new_ns->namespace) + put_namespace(new_ns->namespace); +out_ns: + tsk->nsproxy = old_ns; + kfree(new_ns); + goto out; +} + +void free_nsproxy(struct nsproxy *ns) +{ + if (ns->namespace) + put_namespace(ns->namespace); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + kfree(ns); +} diff --git a/kernel/panic.c b/kernel/panic.c index 8010b9b17aca..525e365f7239 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -270,3 +270,15 @@ void oops_exit(void) { do_oops_enter_exit(); } + +#ifdef CONFIG_CC_STACKPROTECTOR +/* + * Called when gcc's -fstack-protector feature is used, and + * gcc detects corruption of the on-stack canary value + */ +void __stack_chk_fail(void) +{ + panic("stack-protector: Kernel stack is corrupted"); +} +EXPORT_SYMBOL(__stack_chk_fail); +#endif diff --git a/kernel/params.c b/kernel/params.c index 91aea7aa532e..f406655d6653 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name, unsigned int name_skip) { struct module_kobject *mk; + int ret; mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); @@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name, mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); kobject_set_name(&mk->kobj, name); - kobject_register(&mk->kobj); + ret = kobject_register(&mk->kobj); + BUG_ON(ret < 0); /* no need to keep the kobject if no parameter is exported */ if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { @@ -684,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL); */ static int __init param_sysfs_init(void) { - subsystem_register(&module_subsys); + int ret; + + ret = subsystem_register(&module_subsys); + if (ret < 0) { + printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", + __FILE__, __LINE__, ret); + return ret; + } param_sysfs_builtin(); return 0; } -__initcall(param_sysfs_init); +subsys_initcall(param_sysfs_init); EXPORT_SYMBOL(param_set_byte); EXPORT_SYMBOL(param_get_byte); diff --git a/kernel/pid.c b/kernel/pid.c index 93e212f20671..b914392085f9 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/hash.h> +#include <linux/pspace.h> #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; @@ -33,17 +34,20 @@ static int pidhash_shift; static kmem_cache_t *pid_cachep; int pid_max = PID_MAX_DEFAULT; -int last_pid; #define RESERVED_PIDS 300 int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) + +static inline int 
mk_pid(struct pspace *pspace, struct pidmap *map, int off) +{ + return (map - pspace->pidmap)*BITS_PER_PAGE + off; +} + #define find_next_offset(map, off) \ find_next_zero_bit((map)->page, BITS_PER_PAGE, off) @@ -53,13 +57,12 @@ int pid_max_max = PID_MAX_LIMIT; * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -typedef struct pidmap { - atomic_t nr_free; - void *page; -} pidmap_t; - -static pidmap_t pidmap_array[PIDMAP_ENTRIES] = - { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; +struct pspace init_pspace = { + .pidmap = { + [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } + }, + .last_pid = 0 +}; /* * Note: disable interrupts while the pidmap_lock is held as an @@ -74,40 +77,41 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] = * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(int pid) +static fastcall void free_pidmap(struct pspace *pspace, int pid) { - pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); } -static int alloc_pidmap(void) +static int alloc_pidmap(struct pspace *pspace) { - int i, offset, max_scan, pid, last = last_pid; - pidmap_t *map; + int i, offset, max_scan, pid, last = pspace->last_pid; + struct pidmap *map; pid = last + 1; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; - map = &pidmap_array[pid/BITS_PER_PAGE]; + map = &pspace->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* * Free the page if someone raced with us * installing it: */ spin_lock_irq(&pidmap_lock); if (map->page) - free_page(page); + kfree(page); else - map->page = (void *)page; + map->page = page; spin_unlock_irq(&pidmap_lock); if (unlikely(!map->page)) break; @@ -116,11 +120,11 @@ static int alloc_pidmap(void) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - last_pid = pid; + pspace->last_pid = pid; return pid; } offset = find_next_offset(map, offset); - pid = mk_pid(map, offset); + pid = mk_pid(pspace, map, offset); /* * find_next_offset() found a bit, the pid from it * is in-bounds, and if we fell back to the last @@ -131,16 +135,34 @@ static int alloc_pidmap(void) (i != max_scan || pid < last || !((last+1) & BITS_PER_PAGE_MASK))); } - if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) { + if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; offset = 0; } else { - map = &pidmap_array[0]; + map = &pspace->pidmap[0]; offset = RESERVED_PIDS; if (unlikely(last == offset)) break; } - pid = mk_pid(map, offset); + pid = mk_pid(pspace, map, offset); + } + return -1; +} + +static int next_pidmap(struct pspace *pspace, int last) +{ + int offset; + struct pidmap *map, *end; + + offset = (last + 1) & BITS_PER_PAGE_MASK; + map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; + end = &pspace->pidmap[PIDMAP_ENTRIES]; + for (; map < end; map++, offset = 0) { + if (unlikely(!map->page)) + continue; + offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); + if (offset < BITS_PER_PAGE) + return 
mk_pid(pspace, map, offset); } return -1; } @@ -153,6 +175,7 @@ fastcall void put_pid(struct pid *pid) atomic_dec_and_test(&pid->count)) kmem_cache_free(pid_cachep, pid); } +EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { @@ -169,7 +192,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(pid->nr); + free_pidmap(&init_pspace, pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -183,7 +206,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(); + nr = alloc_pidmap(&init_pspace); if (nr < 0) goto out_free; @@ -217,15 +240,13 @@ struct pid * fastcall find_pid(int nr) } return NULL; } +EXPORT_SYMBOL_GPL(find_pid); int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) { struct pid_link *link; struct pid *pid; - WARN_ON(!task->pid); /* to be removed soon */ - WARN_ON(!nr); /* to be removed soon */ - link = &task->pids[type]; link->pid = pid = find_pid(nr); hlist_add_head_rcu(&link->node, &pid->tasks[type]); @@ -252,6 +273,15 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type) free_pid(pid); } +/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ +void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, + enum pid_type type) +{ + new->pids[type].pid = old->pids[type].pid; + hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); + old->pids[type].pid = NULL; +} + struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; @@ -274,6 +304,15 @@ struct task_struct *find_task_by_pid_type(int type, int nr) EXPORT_SYMBOL(find_task_by_pid_type); +struct pid *get_task_pid(struct task_struct *task, enum pid_type type) +{ + struct pid *pid; + rcu_read_lock(); + pid = get_pid(task->pids[type].pid); + rcu_read_unlock(); + return pid; +} + struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; @@ -297,6 +336,26 @@ struct pid *find_get_pid(pid_t nr) } /* + * Used by proc to find the first pid that is greater than or equal to nr. + * + * If there is a pid at nr this function is exactly the same as find_pid. + */ +struct pid *find_ge_pid(int nr) +{ + struct pid *pid; + + do { + pid = find_pid(nr); + if (pid) + break; + nr = next_pidmap(&init_pspace, nr); + } while (nr > 0); + + return pid; +} +EXPORT_SYMBOL_GPL(find_get_pid); + +/* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. @@ -323,10 +382,10 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0.
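find_ge_pid() exists so that /proc can resume a readdir scan at an arbitrary cursor without pinning every task. A sketch of the intended iteration pattern (the cursor handling is illustrative; pid lookup is RCU-protected, as in find_pid() above):

    struct pid *pid;
    int nr = cursor;        /* hypothetical f_pos-derived cursor */

    rcu_read_lock();
    for (pid = find_ge_pid(nr); pid; pid = find_ge_pid(nr + 1)) {
            nr = pid->nr;
            /* ... emit one /proc entry for pid number nr ... */
    }
    rcu_read_unlock();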
We never call free_pidmap(0) */ - set_bit(0, pidmap_array->page); - atomic_dec(&pidmap_array->nr_free); + set_bit(0, init_pspace.pidmap[0].page); + atomic_dec(&init_pspace.pidmap[0].nr_free); pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index d38d9ec3276c..479b16b44f79 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1393,25 +1393,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } } -static long posix_cpu_clock_nanosleep_restart(struct restart_block *); - -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static int do_cpu_nanosleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct itimerspec *it) { - struct restart_block *restart_block = - &current_thread_info()->restart_block; struct k_itimer timer; int error; /* - * Diagnose required errors first. - */ - if (CPUCLOCK_PERTHREAD(which_clock) && - (CPUCLOCK_PID(which_clock) == 0 || - CPUCLOCK_PID(which_clock) == current->pid)) - return -EINVAL; - - /* * Set up a temporary timer and then wait for it to go off. */ memset(&timer, 0, sizeof timer); @@ -1422,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, timer.it_process = current; if (!error) { static struct itimerspec zero_it; - struct itimerspec it = { .it_value = *rqtp, - .it_interval = {} }; + + memset(it, 0, sizeof *it); + it->it_value = *rqtp; spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, &it, NULL); + error = posix_cpu_timer_set(&timer, flags, it, NULL); if (error) { spin_unlock_irq(&timer.it_lock); return error; @@ -1454,49 +1443,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - posix_cpu_timer_set(&timer, 0, &zero_it, &it); + posix_cpu_timer_set(&timer, 0, &zero_it, it); spin_unlock_irq(&timer.it_lock); - if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { /* * It actually did fire already. */ return 0; } + error = -ERESTART_RESTARTBLOCK; + } + + return error; +} + +int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) +{ + struct restart_block *restart_block = + &current_thread_info()->restart_block; + struct itimerspec it; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == current->pid)) + return -EINVAL; + + error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + + if (flags & TIMER_ABSTIME) + return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining.
+ */ + if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_clock_nanosleep_restart; - /* Caller already set restart_block->arg1 */ + restart_block->fn = posix_cpu_nsleep_restart; restart_block->arg0 = which_clock; restart_block->arg1 = (unsigned long) rmtp; restart_block->arg2 = rqtp->tv_sec; restart_block->arg3 = rqtp->tv_nsec; - - error = -ERESTART_RESTARTBLOCK; } - return error; } -static long -posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) +long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->arg0; struct timespec __user *rmtp; struct timespec t; + struct itimerspec it; + int error; rmtp = (struct timespec __user *) restart_block->arg1; t.tv_sec = restart_block->arg2; t.tv_nsec = restart_block->arg3; restart_block->fn = do_no_restart_syscall; - return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); + error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + /* + * Report back to the user the time still remaining. + */ + if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->fn = posix_cpu_nsleep_restart; + restart_block->arg0 = which_clock; + restart_block->arg1 = (unsigned long) rmtp; + restart_block->arg2 = t.tv_sec; + restart_block->arg3 = t.tv_nsec; + } + return error; + } @@ -1524,6 +1553,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags, { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); } +static long process_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} static int thread_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { @@ -1544,6 +1577,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags, { return -EINVAL; } +static long thread_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} static __init int init_posix_cpu_timers(void) { @@ -1553,6 +1590,7 @@ static __init int init_posix_cpu_timers(void) .clock_set = do_posix_clock_nosettime, .timer_create = process_cpu_timer_create, .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { .clock_getres = thread_cpu_clock_getres, @@ -1560,6 +1598,7 @@ static __init int init_posix_cpu_timers(void) .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, .nsleep = thread_cpu_nsleep, + .nsleep_restart = thread_cpu_nsleep_restart, }; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ac6dc8744429..9cbb5d1be06f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1,5 +1,5 @@ /* - * linux/kernel/posix_timers.c + * linux/kernel/posix-timers.c * * * 2002-10-15 Posix Clocks & timers @@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags, return CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t, rmtp)); } + +/* + * nanosleep_restart for monotonic and realtime clocks + */ +static int common_nsleep_restart(struct restart_block *restart_block) +{ + return hrtimer_nanosleep_restart(restart_block); +} + +/* + * This will restart clock_nanosleep. This is required only by + * compat_clock_nanosleep_restart for now. 
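For context on where posix_cpu_nsleep_restart() gets invoked from: when a syscall returns -ERESTART_RESTARTBLOCK, the signal code arranges for the saved function pointer to be re-entered through the generic sys_restart_syscall() path, which in simplified form is just:

    /* kernel/signal.c, simplified: re-dispatch an interrupted
     * syscall through the continuation saved in restart_block. */
    asmlinkage long sys_restart_syscall(void)
    {
            struct restart_block *restart =
                    &current_thread_info()->restart_block;

            return restart->fn(restart);
    }

That is why the restart function above re-saves arg0..arg3 before returning -ERESTART_RESTARTBLOCK again: a second signal must be able to restart the remaining sleep.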
+ */ +long +clock_nanosleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->arg0; + + return CLOCK_DISPATCH(which_clock, nsleep_restart, + (restart_block)); +} diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4b6e2f18e056..825068ca3479 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -64,6 +64,17 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. +config PM_SYSFS_DEPRECATED + bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" + depends on PM && SYSFS + default n + help + The driver model started out with a sysfs file intended to provide + a userspace hook for device power management. This feature has never + worked very well, except for limited testing purposes, and so it will + be removed. It's not clear that a generic mechanism could really + handle the wide variability of device power states; any replacements + are likely to be bus or driver specific. config SOFTWARE_SUSPEND bool "Software Suspend" diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 7c7b9b65e365..d72234942798 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -103,7 +103,7 @@ static void unprepare_processes(void) } /** - * pm_suspend_disk - The granpappy of power management. + * pm_suspend_disk - The granpappy of hibernation power management. * * If we're going through the firmware, then get it over with quickly. * @@ -212,7 +212,7 @@ static int software_resume(void) pr_debug("PM: Preparing devices for restore.\n"); - if ((error = device_suspend(PMSG_FREEZE))) { + if ((error = device_suspend(PMSG_PRETHAW))) { printk("Some devices failed to suspend\n"); swsusp_free(); goto Thaw; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1b84313cbab5..99f9b7d177d6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -906,7 +906,7 @@ static void init_header(struct swsusp_info *info) memset(info, 0, sizeof(struct swsusp_info)); info->version_code = LINUX_VERSION_CODE; info->num_physpages = num_physpages; - memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); info->cpus = num_online_cpus(); info->image_pages = nr_copy_pages; info->pages = nr_copy_pages + nr_meta_pages + 1; @@ -1050,13 +1050,13 @@ static inline int check_header(struct swsusp_info *info) reason = "kernel version"; if (info->num_physpages != num_physpages) reason = "memory size"; - if (strcmp(info->uts.sysname,system_utsname.sysname)) + if (strcmp(info->uts.sysname,init_utsname()->sysname)) reason = "system type"; - if (strcmp(info->uts.release,system_utsname.release)) + if (strcmp(info->uts.release,init_utsname()->release)) reason = "kernel release"; - if (strcmp(info->uts.version,system_utsname.version)) + if (strcmp(info->uts.version,init_utsname()->version)) reason = "version"; - if (strcmp(info->uts.machine,system_utsname.machine)) + if (strcmp(info->uts.machine,init_utsname()->machine)) reason = "machine"; if (reason) { printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 8ef677ea0cea..0b66659dc516 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -247,6 +247,9 @@ int swsusp_suspend(void) restore_processor_state(); Restore_highmem: restore_highmem(); + /* NOTE: device_power_up() is just a resume() for devices + * that suspended with irqs off ... no overall powerup. 
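The switch from PMSG_FREEZE to PMSG_PRETHAW in the resume paths here and below separates "quiesce before writing a snapshot" from "quiesce before being overwritten by a restored snapshot". Driver-side, the distinction surfaces through pm_message_t; a hypothetical driver would dispatch roughly like this (the event names are the ones these messages wrap; the handler body is illustrative):

    static int mydev_suspend(struct device *dev, pm_message_t msg)
    {
            switch (msg.event) {
            case PM_EVENT_FREEZE:   /* snapshot is about to be taken */
            case PM_EVENT_PRETHAW:  /* snapshot is about to be restored */
                    /* stop DMA and interrupts; device state is disposable */
                    break;
            case PM_EVENT_SUSPEND:  /* ordinary suspend-to-RAM */
                    /* save state and power the device down */
                    break;
            }
            return 0;
    }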
+ */ device_power_up(); Enable_irqs: local_irq_enable(); @@ -256,8 +259,12 @@ Enable_irqs: int swsusp_resume(void) { int error; + local_irq_disable(); - if (device_power_down(PMSG_FREEZE)) + /* NOTE: device_power_down() is just a suspend() with irqs off; + * it has no special "power things down" semantics + */ + if (device_power_down(PMSG_PRETHAW)) printk(KERN_ERR "Some devices failed to power down, very bad\n"); /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 2e4499f3e4d9..72825c853cd7 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -196,7 +196,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, snapshot_free_unused_memory(&data->handle); down(&pm_sem); pm_prepare_console(); - error = device_suspend(PMSG_FREEZE); + error = device_suspend(PMSG_PRETHAW); if (!error) { error = swsusp_resume(); device_resume(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9a111f70145c..4d50e06fd745 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data) return 0; } -/* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages - */ - -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) -{ - struct mm_struct *mm; - struct vm_area_struct *vma; - struct page *page; - void *old_buf = buf; - - mm = get_task_mm(tsk); - if (!mm) - return 0; - - down_read(&mm->mmap_sem); - /* ignore errors, just check how much was sucessfully transfered */ - while (len) { - int bytes, ret, offset; - void *maddr; - - ret = get_user_pages(tsk, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); - } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); - } - kunmap(page); - page_cache_release(page); - len -= bytes; - buf += bytes; - addr += bytes; - } - up_read(&mm->mmap_sem); - mmput(mm); - - return buf - old_buf; -} - int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; @@ -494,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) child = find_task_by_pid(pid); if (child) get_task_struct(child); + read_unlock(&tasklist_lock); if (!child) return ERR_PTR(-ESRCH); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 523e46483b99..26bb5ffe1ef1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -71,9 +71,6 @@ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int blimit = 10; static int qhimark = 10000; static int qlowmark = 100; -#ifdef CONFIG_SMP -static int rsinterval = 1000; -#endif static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -86,8 +83,8 @@ static void force_quiescent_state(struct rcu_data *rdp, int cpu; cpumask_t cpumask; set_need_resched(); - if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { - rdp->last_rs_qlen = rdp->qlen; + if (unlikely(!rcp->signaled)) { + rcp->signaled = 1; /* * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. 
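The rcupdate.c change that follows replaces the old qlen/rsinterval heuristic with a per-grace-period latch, rcp->signaled. Reduced to its shape (illustrative; the IPI fan-out is elided):

    static void force_quiescent_state(struct rcu_data *rdp,
                                      struct rcu_ctrlblk *rcp)
    {
            set_need_resched();
            if (unlikely(!rcp->signaled)) {
                    rcp->signaled = 1;  /* cleared in rcu_start_batch() */
                    /* send resched IPIs to the other online CPUs once */
            }
    }

Only the first CPU that notices a callback backlog pays the broadcast cost; the latch is re-armed when the next batch starts, as the rcu_start_batch() hunk below shows.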
@@ -301,6 +298,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) smp_mb(); cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + rcp->signaled = 0; } } @@ -628,9 +626,6 @@ void synchronize_rcu(void) module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -#ifdef CONFIG_SMP -module_param(rsinterval, int, 0); -#endif EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); EXPORT_SYMBOL_GPL(call_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 4d1c3d247127..e2bda18f6f42 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -15,9 +15,10 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2005 + * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Josh Triplett <josh@freedesktop.org> * * See also: Documentation/RCU/torture.txt */ @@ -44,19 +45,25 @@ #include <linux/delay.h> #include <linux/byteorder/swabb.h> #include <linux/stat.h> +#include <linux/srcu.h> MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " + "Josh Triplett <josh@freedesktop.org>"); -static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ +static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ +static int nfakewriters = 4; /* # fake writer threads */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ -static char *torture_type = "rcu"; /* What to torture. */ +static char *torture_type = "rcu"; /* What RCU implementation to torture. 
*/ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +module_param(nfakewriters, int, 0); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); module_param(stat_interval, int, 0); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); module_param(verbose, bool, 0); @@ -66,7 +73,7 @@ MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); module_param(torture_type, charp, 0); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ @@ -80,6 +87,7 @@ static char printk_buf[4096]; static int nrealreaders; static struct task_struct *writer_task; +static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; @@ -104,11 +112,12 @@ static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -atomic_t n_rcu_torture_alloc; -atomic_t n_rcu_torture_alloc_fail; -atomic_t n_rcu_torture_free; -atomic_t n_rcu_torture_mberror; -atomic_t n_rcu_torture_error; +static atomic_t n_rcu_torture_alloc; +static atomic_t n_rcu_torture_alloc_fail; +static atomic_t n_rcu_torture_free; +static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_error; +static struct list_head rcu_torture_removed; /* * Allocate an element from the rcu_tortures pool. @@ -145,7 +154,7 @@ rcu_torture_free(struct rcu_torture *p) struct rcu_random_state { unsigned long rrs_state; - unsigned long rrs_count; + long rrs_count; }; #define RCU_RANDOM_MULT 39916801 /* prime */ @@ -158,7 +167,7 @@ struct rcu_random_state { * Crude but fast random-number generator. Uses a linear congruential * generator, with occasional help from get_random_bytes(). */ -static long +static unsigned long rcu_random(struct rcu_random_state *rrsp) { long refresh; @@ -180,9 +189,11 @@ struct rcu_torture_ops { void (*init)(void); void (*cleanup)(void); int (*readlock)(void); + void (*readdelay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); int (*completed)(void); void (*deferredfree)(struct rcu_torture *p); + void (*sync)(void); int (*stats)(char *page); char *name; }; @@ -192,13 +203,25 @@ static struct rcu_torture_ops *cur_ops = NULL; * Definitions for rcu torture testing. */ -static int rcu_torture_read_lock(void) +static int rcu_torture_read_lock(void) __acquires(RCU) { rcu_read_lock(); return 0; } -static void rcu_torture_read_unlock(int idx) +static void rcu_read_delay(struct rcu_random_state *rrsp) +{ + long delay; + const long longdelay = 200; + + /* We want there to be long-running readers, but not all the time. 
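The reader-delay trick introduced below relies on rcu_random(): delay on average once every nrealreaders * 2 * longdelay calls, so long-running readers exist but stay rare. A standalone demonstration of the distribution (RCU_RANDOM_MULT is from this file; the additive constant and the parameters are assumed for illustration):

    #include <stdio.h>

    #define MULT 39916801UL         /* RCU_RANDOM_MULT (prime) */
    #define ADD  479001701UL        /* assumed additive prime */

    int main(void)
    {
            unsigned long state = 1;
            const long n = 4 /*readers*/ * 2 * 200 /*longdelay*/;
            long i, slow = 0, iters = 1000000;

            for (i = 0; i < iters; i++) {
                    state = state * MULT + ADD;
                    if (!(state % n))
                            slow++; /* the udelay(longdelay) path */
            }
            printf("%ld of %ld iterations took the long path (~1/%ld)\n",
                   slow, iters, n);
            return 0;
    }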
*/ + + delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); + if (!delay) + udelay(longdelay); +} + +static void rcu_torture_read_unlock(int idx) __releases(RCU) { rcu_read_unlock(); } @@ -239,24 +262,65 @@ static struct rcu_torture_ops rcu_ops = { .init = NULL, .cleanup = NULL, .readlock = rcu_torture_read_lock, + .readdelay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, + .sync = synchronize_rcu, .stats = NULL, .name = "rcu" }; +static void rcu_sync_torture_deferred_free(struct rcu_torture *p) +{ + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + + cur_ops->sync(); + list_add(&p->rtort_free, &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } +} + +static void rcu_sync_torture_init(void) +{ + INIT_LIST_HEAD(&rcu_torture_removed); +} + +static struct rcu_torture_ops rcu_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .readdelay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu, + .stats = NULL, + .name = "rcu_sync" +}; + /* * Definitions for rcu_bh torture testing. */ -static int rcu_bh_torture_read_lock(void) +static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) { rcu_read_lock_bh(); return 0; } -static void rcu_bh_torture_read_unlock(int idx) +static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) { rcu_read_unlock_bh(); } @@ -271,19 +335,176 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); } +struct rcu_bh_torture_synchronize { + struct rcu_head head; + struct completion completion; +}; + +static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) +{ + struct rcu_bh_torture_synchronize *rcu; + + rcu = container_of(head, struct rcu_bh_torture_synchronize, head); + complete(&rcu->completion); +} + +static void rcu_bh_torture_synchronize(void) +{ + struct rcu_bh_torture_synchronize rcu; + + init_completion(&rcu.completion); + call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); + wait_for_completion(&rcu.completion); +} + static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, .stats = NULL, .name = "rcu_bh" }; +static struct rcu_torture_ops rcu_bh_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .stats = NULL, + .name = "rcu_bh_sync" +}; + +/* + * Definitions for srcu torture testing. 
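Each implementation under test, srcu included below, is plugged in through the rcu_torture_ops vtable, so the writer and reader kthreads never reference a specific RCU flavor directly. The dispatch pattern, reduced to a standalone sketch (names abbreviated):

    #include <stdio.h>

    struct torture_ops {
            int (*readlock)(void);
            void (*readunlock)(int idx);
            const char *name;
    };

    static int demo_readlock(void) { return 0; }   /* e.g. rcu_read_lock() */
    static void demo_readunlock(int idx) { (void)idx; }

    static struct torture_ops demo = {
            .readlock = demo_readlock,
            .readunlock = demo_readunlock,
            .name = "demo",
    };

    static struct torture_ops *cur_ops = &demo;    /* chosen by torture_type */

    int main(void)
    {
            int idx = cur_ops->readlock();
            /* read-side critical section */
            cur_ops->readunlock(idx);
            printf("exercised \"%s\" ops\n", cur_ops->name);
            return 0;
    }

The int handed back by readlock() matters for srcu: srcu_read_lock() returns the epoch index that srcu_read_unlock() must be given again.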
+ */ + +static struct srcu_struct srcu_ctl; + +static void srcu_torture_init(void) +{ + init_srcu_struct(&srcu_ctl); + rcu_sync_torture_init(); +} + +static void srcu_torture_cleanup(void) +{ + synchronize_srcu(&srcu_ctl); + cleanup_srcu_struct(&srcu_ctl); +} + +static int srcu_torture_read_lock(void) +{ + return srcu_read_lock(&srcu_ctl); +} + +static void srcu_read_delay(struct rcu_random_state *rrsp) +{ + long delay; + const long uspertick = 1000000 / HZ; + const long longdelay = 10; + + /* We want there to be long-running readers, but not all the time. */ + + delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); + if (!delay) + schedule_timeout_interruptible(longdelay); +} + +static void srcu_torture_read_unlock(int idx) +{ + srcu_read_unlock(&srcu_ctl, idx); +} + +static int srcu_torture_completed(void) +{ + return srcu_batches_completed(&srcu_ctl); +} + +static void srcu_torture_synchronize(void) +{ + synchronize_srcu(&srcu_ctl); +} + +static int srcu_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + int idx = srcu_ctl.completed & 0x1; + + cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); + for_each_possible_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +static struct rcu_torture_ops srcu_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .readdelay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .stats = srcu_torture_stats, + .name = "srcu" +}; + +/* + * Definitions for sched torture testing. + */ + +static int sched_torture_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_torture_read_unlock(int idx) +{ + preempt_enable(); +} + +static int sched_torture_completed(void) +{ + return 0; +} + +static void sched_torture_synchronize(void) +{ + synchronize_sched(); +} + +static struct rcu_torture_ops sched_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .stats = NULL, + .name = "sched" +}; + static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_bh_ops, NULL }; + { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, + &sched_ops, NULL }; /* * RCU torture writer kthread. Repeatedly substitutes a new structure @@ -330,6 +551,30 @@ rcu_torture_writer(void *arg) } /* + * RCU torture fake writer kthread. Repeatedly calls sync, with a random + * delay between calls. + */ +static int +rcu_torture_fakewriter(void *arg) +{ + DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); + udelay(rcu_random(&rand) & 0x3ff); + cur_ops->sync(); + } while (!kthread_should_stop() && !fullstop); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* * RCU torture reader kthread. 
Repeatedly dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The * counter in the element should never be greater than 1, otherwise, the @@ -359,7 +604,7 @@ rcu_torture_reader(void *arg) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - udelay(rcu_random(&rand) & 0x7f); + cur_ops->readdelay(&rand); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -483,7 +728,7 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. */ -void rcu_torture_shuffle_tasks(void) +static void rcu_torture_shuffle_tasks(void) { cpumask_t tmp_mask = CPU_MASK_ALL; int i; @@ -507,6 +752,12 @@ void rcu_torture_shuffle_tasks(void) set_cpus_allowed(reader_tasks[i], tmp_mask); } + if (fakewriter_tasks != NULL) { + for (i = 0; i < nfakewriters; i++) + if (fakewriter_tasks[i]) + set_cpus_allowed(fakewriter_tasks[i], tmp_mask); + } + if (writer_task) set_cpus_allowed(writer_task, tmp_mask); @@ -540,11 +791,12 @@ rcu_torture_shuffle(void *arg) static inline void rcu_torture_print_module_parms(char *tag) { - printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " + printk(KERN_ALERT "%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval = %d\n", - torture_type, tag, nrealreaders, stat_interval, verbose, - test_no_idle_hz, shuffle_interval); + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval); } static void @@ -579,6 +831,19 @@ rcu_torture_cleanup(void) } rcu_torture_current = NULL; + if (fakewriter_tasks != NULL) { + for (i = 0; i < nfakewriters; i++) { + if (fakewriter_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_fakewriter task"); + kthread_stop(fakewriter_tasks[i]); + } + fakewriter_tasks[i] = NULL; + } + kfree(fakewriter_tasks); + fakewriter_tasks = NULL; + } + if (stats_task != NULL) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); @@ -666,7 +931,25 @@ rcu_torture_init(void) writer_task = NULL; goto unwind; } - reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), + fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nfakewriters; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + "rcu_torture_fakewriter"); + if (IS_ERR(fakewriter_tasks[i])) { + firsterr = PTR_ERR(fakewriter_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + fakewriter_tasks[i] = NULL; + goto unwind; + } + } + reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_PRINTK_ERRSTRING("out of memory"); diff --git a/kernel/relay.c b/kernel/relay.c index 33345e73485c..1d63ecddfa70 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) * @buf: the buffer struct * @size: total size of the buffer * - * Returns a pointer to the resulting buffer, NULL if unsuccessful. The + * Returns a pointer to the resulting buffer, %NULL if unsuccessful. 
The * passed in size will get page aligned, if it isn't already. */ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) @@ -132,10 +132,9 @@ depopulate: /** * relay_create_buf - allocate and initialize a channel buffer - * @alloc_size: size of the buffer to allocate - * @n_subbufs: number of sub-buffers in the channel + * @chan: the relay channel * - * Returns channel buffer if successful, NULL otherwise + * Returns channel buffer if successful, %NULL otherwise. */ struct rchan_buf *relay_create_buf(struct rchan *chan) { @@ -163,6 +162,7 @@ free_buf: /** * relay_destroy_channel - free the channel struct + * @kref: target kernel reference that contains the relay channel * * Should only be called from kref_put(). */ @@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf) /** * relay_remove_buf - remove a channel buffer + * @kref: target kernel reference that contains the relay buffer * * Removes the file from the filesystem, which also frees the * rchan_buf_struct and the channel buffer. Should only be called from @@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_reset); -/** +/* * relay_open_buf - create a new relay channel buffer * * Internal - used by relay_open(). @@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan, /** * relay_open - create a new relay channel * @base_filename: base name of files to create - * @parent: dentry of parent directory, NULL for root directory + * @parent: dentry of parent directory, %NULL for root directory * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions * - * Returns channel pointer if successful, NULL otherwise. + * Returns channel pointer if successful, %NULL otherwise. * * Creates a channel buffer for each cpu using the sizes and * attributes specified. The created channel buffer files @@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf); * subbufs_consumed should be the number of sub-buffers newly consumed, * not the total consumed. * - * NOTE: kernel clients don't need to call this function if the channel + * NOTE: Kernel clients don't need to call this function if the channel * mode is 'overwrite'. */ void relay_subbufs_consumed(struct rchan *chan, @@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close); * relay_flush - close the channel * @chan: the channel * - * Flushes all channel buffers i.e. forces buffer switch. + * Flushes all channel buffers, i.e. forces buffer switch. */ void relay_flush(struct rchan *chan) { @@ -669,7 +670,7 @@ EXPORT_SYMBOL_GPL(relay_flush); */ static int relay_file_open(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = inode->u.generic_ip; + struct rchan_buf *buf = inode->i_private; kref_get(&buf->kref); filp->private_data = buf; @@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp) return 0; } -/** +/* * relay_file_read_consume - update the consumed count for the buffer */ static void relay_file_read_consume(struct rchan_buf *buf, @@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf, } } -/** +/* * relay_file_read_avail - boolean, are there unconsumed bytes available?
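Since this relay.c hunk is documentation-only, a compact reminder of the client side of the API these comments describe may help; a hypothetical in-kernel user (the name, sizes, and the NULL parent/callback choices are illustrative):

    #include <linux/module.h>
    #include <linux/relay.h>

    static struct rchan *chan;

    static int __init mylog_init(void)
    {
            /* eight 4 KiB sub-buffers per cpu, default callbacks,
             * files created at the root of the relay filesystem */
            chan = relay_open("mylog", NULL, 4096, 8, NULL);
            if (!chan)
                    return -ENOMEM;
            relay_write(chan, "hello", 6);  /* one 6-byte record */
            return 0;
    }

    static void __exit mylog_exit(void)
    {
            relay_close(chan);
    }

    module_init(mylog_init);
    module_exit(mylog_exit);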
*/ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) @@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) /** * relay_file_read_subbuf_avail - return bytes available in sub-buffer + * @read_pos: file read position + * @buf: relay channel buffer */ static size_t relay_file_read_subbuf_avail(size_t read_pos, struct rchan_buf *buf) @@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read + * @read_pos: file read position + * @buf: relay channel buffer * * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise @@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos, /** * relay_file_read_end_pos - return the new read position + * @read_pos: file read position + * @buf: relay channel buffer + * @count: number of bytes to be read */ static size_t relay_file_read_end_pos(struct rchan_buf *buf, size_t read_pos, @@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, return end_pos; } -/** +/* * subbuf_read_actor - read up to one subbuf's worth of data */ static int subbuf_read_actor(size_t read_start, @@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start, return ret; } -/** +/* * subbuf_send_actor - send up to one subbuf's worth of data */ static int subbuf_send_actor(size_t read_start, @@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start, read_descriptor_t *desc, read_actor_t actor); -/** +/* * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries */ static inline ssize_t relay_file_read_subbufs(struct file *filp, diff --git a/kernel/resource.c b/kernel/resource.c index 46286434af80..6de60c12143e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -193,6 +193,13 @@ static int __release_resource(struct resource *old) return -EINVAL; } +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ int request_resource(struct resource *root, struct resource *new) { struct resource *conflict; @@ -205,6 +212,15 @@ int request_resource(struct resource *root, struct resource *new) EXPORT_SYMBOL(request_resource); +/** + * ____request_resource - reserve a resource, with resource conflict returned + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns: + * On success, NULL is returned. + * On error, a pointer to the conflicting resource is returned. + */ struct resource *____request_resource(struct resource *root, struct resource *new) { struct resource *conflict; @@ -217,6 +233,10 @@ struct resource *____request_resource(struct resource *root, struct resource *ne EXPORT_SYMBOL(____request_resource); +/** + * release_resource - release a previously reserved resource + * @old: resource pointer + */ int release_resource(struct resource *old) { int retval; @@ -315,8 +335,16 @@ static int find_resource(struct resource *root, struct resource *new, return -EBUSY; } -/* - * Allocate empty slot in the resource tree given range and alignment. 
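The request_resource()/release_resource() pair documented above operates on whole nodes of the resource tree; a typical caller pins a fixed MMIO window like this (device name and addresses are hypothetical):

    #include <linux/ioport.h>

    static struct resource mydev_mmio = {
            .name  = "mydev",
            .start = 0xf8000000,
            .end   = 0xf8000fff,
            .flags = IORESOURCE_MEM,
    };

    static int mydev_claim(void)
    {
            /* 0 on success, negative errno if the range conflicts */
            return request_resource(&iomem_resource, &mydev_mmio);
    }

    static void mydev_unclaim(void)
    {
            release_resource(&mydev_mmio);
    }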
+/** + * allocate_resource - allocate empty slot in the resource tree given range & alignment + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * @size: requested resource region size + * @min: minimum size to allocate + * @max: maximum size to allocate + * @align: alignment requested, in bytes + * @alignf: alignment function, optional, called if not NULL + * @alignf_data: arbitrary data to pass to the @alignf function */ int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, @@ -344,12 +372,11 @@ EXPORT_SYMBOL(allocate_resource); * * Returns 0 on success, -EBUSY if the resource can't be inserted. * - * This function is equivalent of request_resource when no conflict + * This function is equivalent to request_resource when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become childs of - * the new resource. Otherwise the new resource becomes the child of - * the conflicting resource + * resource is inserted and the conflicting resources become children of + * the new resource. */ int insert_resource(struct resource *parent, struct resource *new) { @@ -357,20 +384,21 @@ int insert_resource(struct resource *parent, struct resource *new) struct resource *first, *next; write_lock(&resource_lock); - begin: - result = 0; - first = __request_resource(parent, new); - if (!first) - goto out; - result = -EBUSY; - if (first == parent) - goto out; + for (;; parent = first) { + result = 0; + first = __request_resource(parent, new); + if (!first) + goto out; - /* Resource fully contained by the clashing resource? Recurse into it */ - if (first->start <= new->start && first->end >= new->end) { - parent = first; - goto begin; + result = -EBUSY; + if (first == parent) + goto out; + + if ((first->start > new->start) || (first->end < new->end)) + break; + if ((first->start == new->start) && (first->end == new->end)) + break; } for (next = first; ; next = next->sibling) { @@ -407,10 +435,15 @@ int insert_resource(struct resource *parent, struct resource *new) return result; } -/* +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * * Given an existing resource, change its start and size to match the - * arguments. Returns -EBUSY if it can't fit. Existing children of - * the resource are assumed to be immutable. + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. */ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { @@ -456,11 +489,19 @@ EXPORT_SYMBOL(adjust_resource); * Note how this, unlike the above, knows about * the IO flag meanings (busy etc). * - * Request-region creates a new busy region. + * request_region creates a new busy region. * - * Check-region returns non-zero if the area is already busy + * check_region returns non-zero if the area is already busy. * - * Release-region releases a matching busy region. + * release_region releases a matching busy region. 
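The insert_resource() rewrite in the hunk above turns the old `goto begin` retry into a bounded loop. Its control flow, extracted (conditions as in the patch; the conflict-grafting tail and the shared exit path are elided):

    for (;; parent = first) {
            first = __request_resource(parent, new);
            if (!first)
                    return 0;       /* slotted in with no conflict */
            if (first == parent)
                    return -EBUSY;  /* new overlaps the parent itself */
            if (first->start > new->start || first->end < new->end)
                    break;          /* partial overlap: graft below */
            if (first->start == new->start && first->end == new->end)
                    break;          /* identical range: also handled below */
            /* otherwise the conflict fully contains new: descend a level */
    }

Each iteration descends one level into a conflicting resource that entirely contains the new range, which is exactly what the recursive `goto begin` version did.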
+ */ + +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, @@ -497,9 +538,23 @@ struct resource * __request_region(struct resource *parent, } return res; } - EXPORT_SYMBOL(__request_region); +/** + * __check_region - check if a resource region is busy or free + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * Returns 0 if the region is free at the moment it is checked, + * returns %-EBUSY if the region is busy. + * + * NOTE: + * This function is deprecated because its use is racy. + * Even if it returns 0, a subsequent call to request_region() + * may fail because another driver etc. just allocated the region. + * Do NOT use it. It will be removed from the kernel. + */ int __check_region(struct resource *parent, resource_size_t start, resource_size_t n) { @@ -513,9 +568,16 @@ int __check_region(struct resource *parent, resource_size_t start, kfree(res); return 0; } - EXPORT_SYMBOL(__check_region); +/** + * __release_region - release a previously reserved resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * The described resource region must match a currently busy region. + */ void __release_region(struct resource *parent, resource_size_t start, resource_size_t n) { @@ -553,7 +615,6 @@ void __release_region(struct resource *parent, resource_size_t start, "<%016llx-%016llx>\n", (unsigned long long)start, (unsigned long long)end); } - EXPORT_SYMBOL(__release_region); /* diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 0c1faa950af7..da8d6bf46457 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -16,7 +16,6 @@ * * See rt.c in preempt-rt for proper credits and further information */ -#include <linux/config.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/module.h> diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 948bd8f643e2..6dcea9dd8c94 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -6,7 +6,6 @@ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * */ -#include <linux/config.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/sched.h> diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 3e13a1e5856f..4ab17da46fd8 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Grab the next task */ task = rt_mutex_owner(lock); + get_task_struct(task); spin_lock_irqsave(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { @@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } - get_task_struct(task); spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); @@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; unsigned long flags; - int boost = 0, res; + int chain_walk = 0, res; spin_lock_irqsave(&current->pi_lock, flags); __rt_mutex_adjust_prio(current); @@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, plist_add(&waiter->pi_list_entry,
&owner->pi_waiters); __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } - spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { - spin_lock_irqsave(&owner->pi_lock, flags); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } + if (owner->pi_blocked_on) + chain_walk = 1; spin_unlock_irqrestore(&owner->pi_lock, flags); } - if (!boost) + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + chain_walk = 1; + + if (!chain_walk) return 0; + /* + * The owner can't disappear while holding a lock, + * so the owner struct is protected by wait_lock. + * Gets dropped in rt_mutex_adjust_prio_chain()! + */ + get_task_struct(owner); + spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, @@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock, int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); unsigned long flags; - int boost = 0; + int chain_walk = 0; spin_lock_irqsave(&current->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); @@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } + if (owner->pi_blocked_on) + chain_walk = 1; + spin_unlock_irqrestore(&owner->pi_lock, flags); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - if (!boost) + if (!chain_walk) return; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + spin_unlock(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); @@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task) return; } - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); spin_unlock_irqrestore(&task->pi_lock, flags); + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } diff --git a/kernel/sched.c b/kernel/sched.c index 5c848fd4e461..53608a59d6e3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -49,7 +49,7 @@ #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/times.h> -#include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> #include <asm/tlb.h> @@ -1232,7 +1232,7 @@ nextgroup: } /* - * find_idlest_queue - find the idlest runqueue among the cpus in group. + * find_idlest_cpu - find the idlest cpu among the cpus in group.
*/ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) @@ -1286,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag) while (sd) { cpumask_t span; struct sched_group *group; - int new_cpu; - int weight; + int new_cpu, weight; + + if (!(sd->flags & flag)) { + sd = sd->child; + continue; + } span = sd->span; group = find_idlest_group(sd, t, cpu); - if (!group) - goto nextlevel; + if (!group) { + sd = sd->child; + continue; + } new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) - goto nextlevel; + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } - /* Now try balancing at a lower domain level */ + /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; -nextlevel: sd = NULL; weight = cpus_weight(span); for_each_domain(cpu, tmp) { @@ -1755,27 +1763,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; - unsigned long prev_task_flags; + long prev_state; rq->prev_mm = NULL; /* * A task struct has one reference for the use as "current". - * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and - * calls schedule one last time. The schedule call will never return, - * and the scheduled task must drop that reference. - * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. + * The test for TASK_DEAD must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. * Manfred Spraul <manfred@colorfullife.com> */ - prev_task_flags = prev->flags; + prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); if (mm) mmdrop(mm); - if (unlikely(prev_task_flags & PF_DEAD)) { + if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -2533,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. 
+ */ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); @@ -2630,7 +2644,7 @@ redo: } if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return nr_moved; @@ -2646,7 +2660,7 @@ out_one_pinned: sd->balance_interval *= 2; if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return 0; } @@ -2668,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int sd_idle = 0; cpumask_t cpus = CPU_MASK_ALL; - if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ + if (sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); @@ -2709,7 +2730,8 @@ redo: if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; } else sd->nr_balance_failed = 0; @@ -2719,7 +2741,7 @@ redo: out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; sd->nr_balance_failed = 0; @@ -3348,9 +3370,6 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); - if (unlikely(prev->flags & PF_DEAD)) - prev->state = EXIT_DEAD; - switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; @@ -4080,6 +4099,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. 
+ * + * NOTE: the task may already be dead */ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) @@ -4107,28 +4128,32 @@ recheck: (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) - != (param->sched_priority == 0)) + if (is_rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - /* - * can't change policy, except between SCHED_NORMAL - * and SCHED_BATCH: - */ - if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && - (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && - !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) - return -EPERM; - /* can't increase priority */ - if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && - param->sched_priority > p->rt_priority && - param->sched_priority > - p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) - return -EPERM; + if (is_rt_policy(policy)) { + unsigned long rlim_rtprio; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + unlock_task_sighand(p, &flags); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } + /* can't change other user's priorities */ if ((current->euid != p->euid) && (current->euid != p->uid)) @@ -4193,14 +4218,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - read_lock_irq(&tasklist_lock); + + rcu_read_lock(); + retval = -ESRCH; p = find_process_by_pid(pid); - if (!p) { - read_unlock_irq(&tasklist_lock); - return -ESRCH; - } - retval = sched_setscheduler(p, policy, &lparam); - read_unlock_irq(&tasklist_lock); + if (p != NULL) + retval = sched_setscheduler(p, policy, &lparam); + rcu_read_unlock(); return retval; } @@ -4382,7 +4406,10 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); #endif long sched_getaffinity(pid_t pid, cpumask_t *mask) @@ -4812,7 +4839,7 @@ void show_state(void) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __devinit init_idle(struct task_struct *idle, int cpu) +void __cpuinit init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -5151,7 +5178,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. 
*/ - BUG_ON(p->flags & PF_DEAD); + BUG_ON(p->state == TASK_DEAD); get_task_struct(p); @@ -5272,9 +5299,11 @@ static struct notifier_block __cpuinitdata migration_notifier = { int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); + int err; /* Start one for the boot CPU: */ - migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); @@ -5385,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd) if (sd->flags & (SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC)) { + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5419,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) pflags &= ~(SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC); + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); } if (~cflags & pflags) return 0; @@ -5441,12 +5474,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) struct sched_domain *parent = tmp->parent; if (!parent) break; - if (sd_parent_degenerate(tmp, parent)) + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + } } - if (sd && sd_degenerate(sd)) + if (sd && sd_degenerate(sd)) { sd = sd->parent; + if (sd) + sd->child = NULL; + } sched_domain_debug(sd, cpu); @@ -5454,7 +5493,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -5482,15 +5521,17 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. 
*/ -static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, - int (*group_fn)(int cpu)) +static void +init_sched_build_groups(struct sched_group groups[], cpumask_t span, + const cpumask_t *cpu_map, + int (*group_fn)(int cpu, const cpumask_t *cpu_map)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; int i; for_each_cpu_mask(i, span) { - int group = group_fn(i); + int group = group_fn(i, cpu_map); struct sched_group *sg = &groups[group]; int j; @@ -5501,7 +5542,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, sg->cpu_power = 0; for_each_cpu_mask(j, span) { - if (group_fn(j) != group) + if (group_fn(j, cpu_map) != group) continue; cpu_set(j, covered); @@ -5968,13 +6009,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) #endif ); if (system_state == SYSTEM_BOOTING) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); + if (num_online_cpus() > 1) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); + } + printk("\n"); } - printk("\n"); } j1 = jiffies; if (migration_debug) @@ -6077,7 +6120,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) { return cpu; } @@ -6088,31 +6131,36 @@ static int cpu_to_cpu_group(int cpu) */ #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group *sched_group_core_bycpu[NR_CPUS]; +static struct sched_group sched_group_core[NR_CPUS]; #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int cpu_to_core_group(int cpu) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) { - return first_cpu(cpu_sibling_map[cpu]); + cpumask_t mask = cpu_sibling_map[cpu]; + cpus_and(mask, mask, *cpu_map); + return first_cpu(mask); } #elif defined(CONFIG_SCHED_MC) -static int cpu_to_core_group(int cpu) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) { return cpu; } #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; -static int cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) { #ifdef CONFIG_SCHED_MC cpumask_t mask = cpu_coregroup_map(cpu); + cpus_and(mask, mask, *cpu_map); return first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) - return first_cpu(cpu_sibling_map[cpu]); + cpumask_t mask = cpu_sibling_map[cpu]; + cpus_and(mask, mask, *cpu_map); + return first_cpu(mask); #else return cpu; #endif @@ -6130,7 +6178,7 @@ static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; -static int cpu_to_allnodes_group(int cpu) +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) { return cpu_to_node(cpu); } @@ -6162,12 +6210,11 @@ next_sg: } #endif +#ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ static void free_sched_groups(const cpumask_t *cpu_map) { - int cpu; -#ifdef 
CONFIG_NUMA - int i; + int cpu, i; for_each_cpu_mask(cpu, *cpu_map) { struct sched_group *sched_group_allnodes @@ -6204,19 +6251,63 @@ next_sg: kfree(sched_group_nodes); sched_group_nodes_bycpu[cpu] = NULL; } +} +#else +static void free_sched_groups(const cpumask_t *cpu_map) +{ +} #endif - for_each_cpu_mask(cpu, *cpu_map) { - if (sched_group_phys_bycpu[cpu]) { - kfree(sched_group_phys_bycpu[cpu]); - sched_group_phys_bycpu[cpu] = NULL; - } -#ifdef CONFIG_SCHED_MC - if (sched_group_core_bycpu[cpu]) { - kfree(sched_group_core_bycpu[cpu]); - sched_group_core_bycpu[cpu] = NULL; - } -#endif + +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of a sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be the same + * unless there are asymmetries in the topology. If there are asymmetries, + * a group with more cpu_power will pick up more load than a group with less + * cpu_power. + * + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents + * the maximum number of tasks a group can handle in the presence of other idle + * or lightly loaded groups in the same sched domain. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_domain *child; + struct sched_group *group; + + WARN_ON(!sd || !sd->groups); + + if (cpu != first_cpu(sd->groups->cpumask)) + return; + + child = sd->child; + + /* + * For perf policy, if the groups in the child domain share resources + * (for example cores sharing some portions of the cache hierarchy + * or SMT), then set this domain's group cpu_power such that each group + * can handle only one task, when there are other idle groups in the + * same sched domain. 
+ */ + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && + (child->flags & + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { + sd->groups->cpu_power = SCHED_LOAD_SCALE; + return; } + + sd->groups->cpu_power = 0; + + /* + * add cpu_power of each child group to this groups cpu_power + */ + group = child->groups; + do { + sd->groups->cpu_power += group->cpu_power; + group = group->next; + } while (group != child->groups); } /* @@ -6226,10 +6317,7 @@ next_sg: static int build_sched_domains(const cpumask_t *cpu_map) { int i; - struct sched_group *sched_group_phys = NULL; -#ifdef CONFIG_SCHED_MC - struct sched_group *sched_group_core = NULL; -#endif + struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; struct sched_group *sched_group_allnodes = NULL; @@ -6261,9 +6349,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { if (!sched_group_allnodes) { sched_group_allnodes - = kmalloc(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL); + = kmalloc_node(sizeof(struct sched_group) + * MAX_NUMNODES, + GFP_KERNEL, + cpu_to_node(i)); if (!sched_group_allnodes) { printk(KERN_WARNING "Can not alloc allnodes sched group\n"); @@ -6275,7 +6364,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; - group = cpu_to_allnodes_group(i); + group = cpu_to_allnodes_group(i, cpu_map); sd->groups = &sched_group_allnodes[group]; p = sd; } else @@ -6285,60 +6374,42 @@ static int build_sched_domains(const cpumask_t *cpu_map) *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; + if (p) + p->child = sd; cpus_and(sd->span, sd->span, *cpu_map); #endif - if (!sched_group_phys) { - sched_group_phys - = kmalloc(sizeof(struct sched_group) * NR_CPUS, - GFP_KERNEL); - if (!sched_group_phys) { - printk (KERN_WARNING "Can not alloc phys sched" - "group\n"); - goto error; - } - sched_group_phys_bycpu[i] = sched_group_phys; - } - p = sd; sd = &per_cpu(phys_domains, i); - group = cpu_to_phys_group(i); + group = cpu_to_phys_group(i, cpu_map); *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; + if (p) + p->child = sd; sd->groups = &sched_group_phys[group]; #ifdef CONFIG_SCHED_MC - if (!sched_group_core) { - sched_group_core - = kmalloc(sizeof(struct sched_group) * NR_CPUS, - GFP_KERNEL); - if (!sched_group_core) { - printk (KERN_WARNING "Can not alloc core sched" - "group\n"); - goto error; - } - sched_group_core_bycpu[i] = sched_group_core; - } - p = sd; sd = &per_cpu(core_domains, i); - group = cpu_to_core_group(i); + group = cpu_to_core_group(i, cpu_map); *sd = SD_MC_INIT; sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + p->child = sd; sd->groups = &sched_group_core[group]; #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - group = cpu_to_cpu_group(i); + group = cpu_to_cpu_group(i, cpu_map); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + p->child = sd; sd->groups = &sched_group_cpus[group]; #endif } @@ -6352,7 +6423,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) continue; init_sched_build_groups(sched_group_cpus, this_sibling_map, - &cpu_to_cpu_group); + cpu_map, &cpu_to_cpu_group); } #endif @@ -6364,7 +6435,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_core_map)) continue; init_sched_build_groups(sched_group_core, 
this_core_map, - &cpu_to_core_group); + cpu_map, &cpu_to_core_group); } #endif @@ -6378,14 +6449,14 @@ static int build_sched_domains(const cpumask_t *cpu_map) continue; init_sched_build_groups(sched_group_phys, nodemask, - &cpu_to_phys_group); + cpu_map, &cpu_to_phys_group); } #ifdef CONFIG_NUMA /* Set up node groups */ if (sched_group_allnodes) init_sched_build_groups(sched_group_allnodes, *cpu_map, - &cpu_to_allnodes_group); + cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6457,72 +6528,20 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; sd = &per_cpu(cpu_domains, i); - sd->groups->cpu_power = SCHED_LOAD_SCALE; + init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - int power; - struct sched_domain *sd; sd = &per_cpu(core_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) - * SCHED_LOAD_SCALE / 10; - sd->groups->cpu_power = power; + init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; -#ifdef CONFIG_SCHED_MC - sd = &per_cpu(phys_domains, i); - if (i != first_cpu(sd->groups->cpumask)) - continue; - - sd->groups->cpu_power = 0; - if (sched_mc_power_savings || sched_smt_power_savings) { - int j; - - for_each_cpu_mask(j, sd->groups->cpumask) { - struct sched_domain *sd1; - sd1 = &per_cpu(core_domains, j); - /* - * for each core we will add once - * to the group in physical domain - */ - if (j != first_cpu(sd1->groups->cpumask)) - continue; - - if (sched_smt_power_savings) - sd->groups->cpu_power += sd1->groups->cpu_power; - else - sd->groups->cpu_power += SCHED_LOAD_SCALE; - } - } else - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; -#else - int power; sd = &per_cpu(phys_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; -#endif + init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA @@ -6530,7 +6549,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) init_numa_sched_groups_power(sched_group_nodes[i]); if (sched_group_allnodes) { - int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); + int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); struct sched_group *sg = &sched_group_allnodes[group]; init_numa_sched_groups_power(sg); @@ -6556,9 +6575,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) return 0; +#ifdef CONFIG_NUMA error: free_sched_groups(cpu_map); return -ENOMEM; +#endif } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. 
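
The init_sched_groups_power() helper introduced above centralizes the cpu_power setup that the hunks just shown had open-coded per topology level: a group either gets a flat SCHED_LOAD_SCALE (when its child groups share CPU resources under the performance policy) or the sum of its child groups' cpu_power, gathered by walking their circular ->next list. Below is a minimal userspace model of that accumulation; the names and the scale value are illustrative stand-ins, not kernel code.

/*
 * Simplified model of init_sched_groups_power(): a parent group's
 * capacity is the sum of its child groups' capacities, collected by
 * walking the circular ->next ring, as struct sched_group does.
 */
#include <stdio.h>

#define LOAD_SCALE 128			/* stand-in for SCHED_LOAD_SCALE */

struct group {
	int cpu_power;
	struct group *next;		/* circular sibling list */
};

static int sum_child_power(struct group *head)
{
	struct group *g = head;
	int power = 0;

	do {				/* same do/while walk as the patch */
		power += g->cpu_power;
		g = g->next;
	} while (g != head);

	return power;
}

int main(void)
{
	/* two sibling core groups, e.g. one physical package */
	struct group core1 = { LOAD_SCALE, NULL };
	struct group core0 = { LOAD_SCALE, &core1 };

	core1.next = &core0;		/* close the ring */
	printf("package cpu_power = %d\n", sum_child_power(&core0));
	return 0;
}

For the symmetric two-core case this prints 256, i.e. twice the scale, which matches what the accumulation loop in the patch computes when no resource sharing caps the group at a single task.
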
@@ -6740,11 +6761,20 @@ static int update_sched_domains(struct notifier_block *nfb, void __init sched_init_smp(void) { + cpumask_t non_isolated_cpus; + lock_cpu_hotplug(); arch_init_sched_domains(&cpu_online_map); + cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map); + if (cpus_empty(non_isolated_cpus)) + cpu_set(smp_processor_id(), non_isolated_cpus); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); } #else void __init sched_init_smp(void) diff --git a/kernel/signal.c b/kernel/signal.c index bfdb5686fa3e..7ed8d5304bec 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -417,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info) { - int sig = 0; + int sig = next_signal(pending, mask); - sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { @@ -432,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, if (!collect_signal(sig, pending, info)) sig = 0; - } - recalc_sigpending(); return sig; } @@ -451,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); + recalc_sigpending_tsk(tsk); if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our @@ -1057,28 +1055,44 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) } /* - * kill_pg_info() sends a signal to a process group: this is what the tty + * kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) */ -int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int retval, success; - if (pgrp <= 0) - return -EINVAL; - success = 0; retval = -ESRCH; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p); success |= !err; retval = err; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return success ? 
0 : retval; } +int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) +{ + int retval; + + read_lock(&tasklist_lock); + retval = __kill_pgrp_info(sig, info, pgrp); + read_unlock(&tasklist_lock); + + return retval; +} + +int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +{ + if (pgrp <= 0) + return -EINVAL; + + return __kill_pgrp_info(sig, info, find_pid(pgrp)); +} + int kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { @@ -1091,8 +1105,7 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) return retval; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { int error; int acquired_tasklist_lock = 0; @@ -1103,7 +1116,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) read_lock(&tasklist_lock); acquired_tasklist_lock = 1; } - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); @@ -1113,8 +1126,18 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -/* like kill_proc_info(), but doesn't use uid/euid of "current" */ -int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, +int +kill_proc_info(int sig, struct siginfo *info, pid_t pid) +{ + int error; + rcu_read_lock(); + error = kill_pid_info(sig, info, find_pid(pid)); + rcu_read_unlock(); + return error; +} + +/* like kill_pid_info(), but doesn't use uid/euid of "current" */ +int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, uid_t uid, uid_t euid, u32 secid) { int ret = -EINVAL; @@ -1124,7 +1147,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, return ret; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; @@ -1148,7 +1171,7 @@ out_unlock: read_unlock(&tasklist_lock); return ret; } -EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); +EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); /* * kill_something_info() interprets pid in interesting ways just like kill(2). 
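
The signal.c conversion above moves the process-group helpers from numeric pid_t to struct pid: __kill_pg_info() is now a thin wrapper that resolves pgrp via find_pid() and calls __kill_pgrp_info(), and the following hunk adds kill_pgrp()/kill_pid() as exported struct pid entry points. The userspace-visible semantics are unchanged; the sketch below (plain POSIX, not kernel code) shows the behavior these helpers implement, putting a child in its own process group and signalling the whole group, much as the tty layer does for ^C.

/*
 * Userspace view of process-group signalling: killpg() enters the
 * kernel as kill(2) with a negative pid, which (after this patch)
 * reaches __kill_pgrp_info() via kill_something_info().
 */
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child < 0) {
		perror("fork");
		return 1;
	}
	if (child == 0) {
		setpgid(0, 0);		/* new group, pgid == own pid */
		pause();		/* wait for the group signal */
		_exit(0);
	}
	setpgid(child, child);		/* set on both sides, avoiding a race */
	sleep(1);
	if (killpg(child, SIGTERM) < 0)
		perror("killpg");
	waitpid(child, NULL, 0);
	printf("signalled process group %d\n", (int)child);
	return 0;
}
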
@@ -1266,6 +1289,18 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pgrp); + +int kill_pid(struct pid *pid, int sig, int priv) +{ + return kill_pid_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pid); + int kill_pg(pid_t pgrp, int sig, int priv) { @@ -2577,6 +2612,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) } #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + void __init signals_init(void) { sigqueue_cachep = diff --git a/kernel/softirq.c b/kernel/softirq.c index 3789ca98197c..bf25015dce16 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -612,7 +612,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = { __init int spawn_ksoftirqd(void) { void *cpu = (void *)(long)smp_processor_id(); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + + BUG_ON(err == NOTIFY_BAD); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 03e6a2b0b787..50afeb813305 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = { __init void spawn_softlockup_task(void) { void *cpu = (void *)(long)smp_processor_id(); + int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index fb524b009eef..476c3741511b 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -7,6 +7,11 @@ * * This file contains the spinlock/rwlock implementations for the * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) + * + * Note that some architectures have special knowledge about the + * stack frames of these functions in their profile_pc. If you + * change anything significant here that could change the stack + * frame contact the architecture maintainers. */ #include <linux/linkage.h> @@ -16,17 +21,6 @@ #include <linux/debug_locks.h> #include <linux/module.h> -/* - * Generic declaration of the raw read_trylock() function, - * architectures are supposed to optimize this: - */ -int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) -{ - __raw_read_lock(lock); - return 1; -} -EXPORT_SYMBOL(generic__raw_read_trylock); - int __lockfunc _spin_trylock(spinlock_t *lock) { preempt_disable(); @@ -221,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - cpu_relax(); \ + _raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ @@ -243,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - cpu_relax(); \ + _raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ diff --git a/kernel/srcu.c b/kernel/srcu.c new file mode 100644 index 000000000000..3507cabe963b --- /dev/null +++ b/kernel/srcu.c @@ -0,0 +1,258 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/srcu.h> + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + sp->completed = 0; + mutex_init(&sp->mutex); + sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + return (sp->per_cpu_ref ? 0 : -ENOMEM); +} + +/* + * srcu_readers_active_idx -- returns approximate number of readers + * active on the specified rank of per-CPU counters. + */ + +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + int sum; + + sum = 0; + for_each_possible_cpu(cpu) + sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; + return sum; +} + +/** + * srcu_readers_active - returns approximate number of readers. + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +int srcu_readers_active(struct srcu_struct *sp) +{ + return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); +} + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + int sum; + + sum = srcu_readers_active(sp); + WARN_ON(sum); /* Leakage unless caller handles error. */ + if (sum != 0) + return; + free_percpu(sp->per_cpu_ref); + sp->per_cpu_ref = NULL; +} + +/** + * srcu_read_lock - register a new reader for an SRCU-protected structure. + * @sp: srcu_struct in which to register the new reader. + * + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + preempt_disable(); + idx = sp->completed & 0x1; + barrier(); /* ensure compiler looks -once- at sp->completed. 
*/ + per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; + srcu_barrier(); /* ensure compiler won't misorder critical section. */ + preempt_enable(); + return idx; +} + +/** + * srcu_read_unlock - unregister an old reader from an SRCU-protected structure. + * @sp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + preempt_disable(); + srcu_barrier(); /* ensure compiler won't misorder critical section. */ + per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; + preempt_enable(); +} + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + int idx; + + idx = sp->completed; + mutex_lock(&sp->mutex); + + /* + * Check to see if someone else did the work for us while we were + * waiting to acquire the lock. We need -two- advances of + * the counter, not just one. If there was but one, we might have + * shown up -after- our helper's first synchronize_sched(), thus + * having failed to prevent CPU-reordering races with concurrent + * srcu_read_unlock()s on other CPUs (see comment below). So we + * either (1) wait for two or (2) supply the second ourselves. + */ + + if ((sp->completed - idx) >= 2) { + mutex_unlock(&sp->mutex); + return; + } + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * The preceding synchronize_sched() ensures that any CPU that + * sees the new value of sp->completed will also see any preceding + * changes to data structures made by this CPU. This prevents + * some other CPU from reordering the accesses in its SRCU + * read-side critical section to precede the corresponding + * srcu_read_lock() -- ensuring that such references will in + * fact be protected. + * + * So it is now safe to do the flip. + */ + + idx = sp->completed & 0x1; + sp->completed++; + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * At this point, because of the preceding synchronize_sched(), + * all srcu_read_lock() calls using the old counters have completed. + * Their corresponding critical sections might well be still + * executing, but the srcu_read_lock() primitives themselves + * will have finished executing. + */ + + while (srcu_readers_active_idx(sp, idx)) + schedule_timeout_interruptible(1); + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * The preceding synchronize_sched() forces all srcu_read_unlock() + * primitives that were executing concurrently with the preceding + * for_each_possible_cpu() loop to have completed by this point. 
+ * More importantly, it also forces the corresponding SRCU read-side + * critical sections to have also completed, and the corresponding + * references to SRCU-protected data items to be dropped. + * + * Note: + * + * Despite what you might think at first glance, the + * preceding synchronize_sched() -must- be within the + * critical section ended by the following mutex_unlock(). + * Otherwise, a task taking the early exit can race + * with a srcu_read_unlock(), which might have executed + * just before the preceding srcu_readers_active() check, + * and whose CPU might have reordered the srcu_read_unlock() + * with the preceding critical section. In this case, there + * is nothing preventing the synchronize_sched() task that is + * taking the early exit from freeing a data structure that + * is still being referenced (out of order) by the task + * doing the srcu_read_unlock(). + * + * Alternatively, the comparison with "2" on the early exit + * could be changed to "3", but this increases synchronize_srcu() + * latency for bulk loads. So the current code is preferred. + */ + + mutex_unlock(&sp->mutex); +} + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ + +long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->completed; +} + +EXPORT_SYMBOL_GPL(init_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(srcu_read_lock); +EXPORT_SYMBOL_GPL(srcu_read_unlock); +EXPORT_SYMBOL_GPL(synchronize_srcu); +EXPORT_SYMBOL_GPL(srcu_batches_completed); +EXPORT_SYMBOL_GPL(srcu_readers_active); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 51cacd111dbd..12458040e665 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,3 +1,6 @@ +/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. + * GPL v2 and any later version. + */ #include <linux/stop_machine.h> #include <linux/kthread.h> #include <linux/sched.h> diff --git a/kernel/sys.c b/kernel/sys.c index e236f98f7ec5..98489d82801b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -28,6 +28,7 @@ #include <linux/tty.h> #include <linux/signal.h> #include <linux/cn_proc.h> +#include <linux/getcpu.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -91,7 +92,8 @@ EXPORT_SYMBOL(fs_overflowgid); */ int C_A_D = 1; -int cad_pid = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); /* * Notifier list for kernel code which wants to be called @@ -151,7 +153,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, /* * Atomic notifier chain routines. Registration and unregistration - * use a mutex, and call_chain is synchronized by RCU (no locks). + * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** @@ -220,7 +222,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * of the last notifier function called. */ -int atomic_notifier_call_chain(struct atomic_notifier_head *nh, +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; @@ -399,6 +401,129 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +/* + * SRCU notifier chain routines. Registration and unregistration + * use a mutex, and call_chain is synchronized by SRCU (no locks). 
+ */ + +/** + * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an SRCU notifier chain. + * Must be called in process context. + * + * Currently always returns zero. + */ + +int srcu_notifier_chain_register(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_register(&nh->head, n); + mutex_unlock(&nh->mutex); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); + +/** + * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from an SRCU notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. + */ +int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_unregister(&nh->head, n); + mutex_unlock(&nh->mutex); + synchronize_srcu(&nh->srcu); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); + +/** + * srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in a notifier chain in turn. The functions + * run in a process context, so they are allowed to block. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ + +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) +{ + int ret; + int idx; + + idx = srcu_read_lock(&nh->srcu); + ret = notifier_call_chain(&nh->head, val, v); + srcu_read_unlock(&nh->srcu, idx); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); + +/** + * srcu_init_notifier_head - Initialize an SRCU notifier head + * @nh: Pointer to head of the srcu notifier chain + * + * Unlike other sorts of notifier heads, SRCU notifier heads require + * dynamic initialization. Be sure to call this routine before + * calling any of the other SRCU notifier routines for this head. + * + * If an SRCU notifier head is deallocated, it must first be cleaned + * up by calling srcu_cleanup_notifier_head(). Otherwise the head's + * per-cpu data (used by the SRCU mechanism) will leak. 
+ */ + +void srcu_init_notifier_head(struct srcu_notifier_head *nh) +{ + mutex_init(&nh->mutex); + if (init_srcu_struct(&nh->srcu) < 0) + BUG(); + nh->head = NULL; +} + +EXPORT_SYMBOL_GPL(srcu_init_notifier_head); + /** * register_reboot_notifier - Register function to be called at reboot time * @nb: Info about notifier function to be called @@ -606,12 +731,10 @@ static void kernel_restart_prepare(char *cmd) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - if (!cmd) { + if (!cmd) printk(KERN_EMERG "Restarting system.\n"); - } else { + else printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - } - printk(".\n"); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -627,9 +750,8 @@ static void kernel_kexec(void) #ifdef CONFIG_KEXEC struct kimage *image; image = xchg(&kexec_image, NULL); - if (!image) { + if (!image) return; - } kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); @@ -775,10 +897,9 @@ void ctrl_alt_del(void) if (C_A_D) schedule_work(&cad_work); else - kill_proc(cad_pid, SIGINT, 1); + kill_cad_pid(SIGINT, 1); } - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -823,12 +944,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) (current->sgid == egid) || capable(CAP_SETGID)) new_egid = egid; - else { + else return -EPERM; - } } - if (new_egid != old_egid) - { + if (new_egid != old_egid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -857,19 +976,14 @@ asmlinkage long sys_setgid(gid_t gid) if (retval) return retval; - if (capable(CAP_SETGID)) - { - if(old_egid != gid) - { + if (capable(CAP_SETGID)) { + if (old_egid != gid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; - } - else if ((gid == current->gid) || (gid == current->sgid)) - { - if(old_egid != gid) - { + } else if ((gid == current->gid) || (gid == current->sgid)) { + if (old_egid != gid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -900,8 +1014,7 @@ static int set_user(uid_t new_ruid, int dumpclear) switch_uid(new_user); - if(dumpclear) - { + if (dumpclear) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -957,8 +1070,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) return -EAGAIN; - if (new_euid != old_euid) - { + if (new_euid != old_euid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1008,8 +1120,7 @@ asmlinkage long sys_setuid(uid_t uid) } else if ((uid != current->uid) && (uid != new_suid)) return -EPERM; - if (old_euid != uid) - { + if (old_euid != uid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1054,8 +1165,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) return -EAGAIN; } if (euid != (uid_t) -1) { - if (euid != current->euid) - { + if (euid != current->euid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1105,8 +1215,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) return -EPERM; } if (egid != (gid_t) -1) { - if (egid != current->egid) - { + if (egid != current->egid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1151,10 +1260,8 @@ asmlinkage long sys_setfsuid(uid_t uid) if (uid == current->uid || uid == current->euid || uid == current->suid || uid == current->fsuid || - capable(CAP_SETUID)) - { - if (uid != old_fsuid) - { + capable(CAP_SETUID)) { + if (uid != old_fsuid) { current->mm->dumpable = 
suid_dumpable; smp_wmb(); } @@ -1182,10 +1289,8 @@ asmlinkage long sys_setfsgid(gid_t gid) if (gid == current->gid || gid == current->egid || gid == current->sgid || gid == current->fsgid || - capable(CAP_SETGID)) - { - if (gid != old_fsgid) - { + capable(CAP_SETGID)) { + if (gid != old_fsgid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1321,9 +1426,9 @@ out: asmlinkage long sys_getpgid(pid_t pid) { - if (!pid) { + if (!pid) return process_group(current); - } else { + else { int retval; struct task_struct *p; @@ -1353,9 +1458,9 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { - if (!pid) { + if (!pid) return current->signal->session; - } else { + else { int retval; struct task_struct *p; @@ -1363,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid) p = find_task_by_pid(pid); retval = -ESRCH; - if(p) { + if (p) { retval = security_task_getsid(p); if (!retval) retval = p->signal->session; @@ -1431,9 +1536,9 @@ struct group_info *groups_alloc(int gidsetsize) group_info->nblocks = nblocks; atomic_set(&group_info->usage, 1); - if (gidsetsize <= NGROUPS_SMALL) { + if (gidsetsize <= NGROUPS_SMALL) group_info->blocks[0] = group_info->small_block; - } else { + else { for (i = 0; i < nblocks; i++) { gid_t *b; b = (void *)__get_free_page(GFP_USER); @@ -1489,7 +1594,7 @@ static int groups_to_user(gid_t __user *grouplist, /* fill a group_info from a user-space array - it must be allocated already */ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) - { +{ int i; int count = group_info->ngroups; @@ -1647,9 +1752,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) int in_group_p(gid_t grp) { int retval = 1; - if (grp != current->fsgid) { + if (grp != current->fsgid) retval = groups_search(current->group_info, grp); - } return retval; } @@ -1658,9 +1762,8 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { int retval = 1; - if (grp != current->egid) { + if (grp != current->egid) retval = groups_search(current->group_info, grp); - } return retval; } @@ -1675,7 +1778,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name) int errno = 0; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1693,8 +1796,8 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.nodename, tmp, len); - system_utsname.nodename[len] = 0; + memcpy(utsname()->nodename, tmp, len); + utsname()->nodename[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1710,11 +1813,11 @@ asmlinkage long sys_gethostname(char __user *name, int len) if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(system_utsname.nodename); + i = 1 + strlen(utsname()->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, system_utsname.nodename, i)) + if (copy_to_user(name, utsname()->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1739,8 +1842,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.domainname, tmp, len); - system_utsname.domainname[len] = 0; + memcpy(utsname()->domainname, tmp, len); + utsname()->domainname[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1775,9 +1878,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit 
__user *r task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); - if(x.rlim_cur > 0x7FFFFFFF) + if (x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; - if(x.rlim_max > 0x7FFFFFFF) + if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; } @@ -2062,3 +2165,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, } return error; } + +asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, + struct getcpu_cache __user *cache) +{ + int err = 0; + int cpu = raw_smp_processor_id(); + if (cpup) + err |= put_user(cpu, cpup); + if (nodep) + err |= put_user(cpu_to_node(cpu), nodep); + if (cache) { + /* + * The cache is not needed for this implementation, + * but make sure user programs pass something + * valid. vsyscall implementations can instead make + * good use of the cache. Only use t0 and t1 because + * these are available in both 32bit and 64bit ABI (no + * need for a compat_getcpu). 32bit has enough + * padding + */ + unsigned long t0, t1; + get_user(t0, &cache->blob[0]); + get_user(t1, &cache->blob[1]); + t0++; + t1++; + put_user(t0, &cache->blob[0]); + put_user(t1, &cache->blob[1]); + } + return err ? -EFAULT : 0; +} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6991bece67e8..7a3b2e75f040 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -134,3 +134,8 @@ cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); + +/* block-layer dependent */ +cond_syscall(sys_bdflush); +cond_syscall(sys_ioprio_set); +cond_syscall(sys_ioprio_get); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fd43c3e6786b..8020fb273c4f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -52,6 +52,10 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_X86 +#include <asm/nmi.h> +#endif + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. 
*/ @@ -64,7 +68,6 @@ extern int sysrq_enabled; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; -extern int cad_pid; extern int pid_max; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; @@ -74,12 +77,6 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) -int unknown_nmi_panic; -extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, - void __user *, size_t *, loff_t *); -#endif - /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; @@ -94,13 +91,8 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -extern size_t shm_ctlmax; -extern size_t shm_ctlall; -extern int shm_ctlmni; -extern int msg_ctlmax; -extern int msg_ctlmnb; -extern int msg_ctlmni; -extern int sem_ctls[]; +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif #ifdef __sparc__ @@ -136,9 +128,15 @@ extern int no_unaligned_warning; extern int max_lock_depth; #endif -static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, - ctl_table *, void **); -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +#ifdef CONFIG_SYSCTL_SYSCALL +static int parse_table(int __user *, int, void __user *, size_t __user *, + void __user *, size_t, ctl_table *, void **); +#endif + +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); static ctl_table root_table[]; @@ -164,7 +162,7 @@ int sysctl_legacy_va_layout; /* /proc declarations: */ -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); @@ -228,51 +226,100 @@ static ctl_table root_table[] = { }; static ctl_table kern_table[] = { +#ifndef CONFIG_UTS_NS + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, +#else /* !CONFIG_UTS_NS */ { .ctl_name = KERN_OSTYPE, .procname = "ostype", - .data = system_utsname.sysname, - .maxlen = 
sizeof(system_utsname.sysname), + .data = NULL, + /* could maybe use __NEW_UTS_LEN here? */ + .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_OSRELEASE, .procname = "osrelease", - .data = system_utsname.release, - .maxlen = sizeof(system_utsname.release), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, release), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_VERSION, .procname = "version", - .data = system_utsname.version, - .maxlen = sizeof(system_utsname.version), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, version), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_NODENAME, .procname = "hostname", - .data = system_utsname.nodename, - .maxlen = sizeof(system_utsname.nodename), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), .mode = 0644, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_DOMAINNAME, .procname = "domainname", - .data = system_utsname.domainname, - .maxlen = sizeof(system_utsname.domainname), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), .mode = 0644, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, +#endif /* !CONFIG_UTS_NS */ { .ctl_name = KERN_PANIC, .procname = "panic", @@ -293,7 +340,7 @@ static ctl_table kern_table[] = { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, - .maxlen = 64, + .maxlen = 128, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, @@ -431,58 +478,58 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = &shm_ctlmax, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = &shm_ctlall, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = &shm_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = &msg_ctlmax, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = &msg_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = &msg_ctlmnb, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = &sem_ctls, + .data = NULL, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -498,10 +545,10 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_CADPID, .procname = "cad_pid", - .data = &cad_pid, + .data = 
NULL, .maxlen = sizeof (int), .mode = 0600, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_cad_pid, }, { .ctl_name = KERN_MAX_THREADS, @@ -628,11 +675,27 @@ static ctl_table kern_table[] = { .data = &unknown_nmi_panic, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_unknown_nmi_panic, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_NMI_WATCHDOG, + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_nmi_enabled, }, #endif #if defined(CONFIG_X86) { + .ctl_name = KERN_PANIC_ON_NMI, + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", .data = &bootloader_type, @@ -1149,12 +1212,13 @@ static void start_unregistering(struct ctl_table_header *p) void __init sysctl_init(void) { -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL register_proc_table(root_table, proc_sys_root, &root_table_header); init_irq_proc(); #endif } +#ifdef CONFIG_SYSCTL_SYSCALL int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1208,6 +1272,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) unlock_kernel(); return error; } +#endif /* CONFIG_SYSCTL_SYSCALL */ /* * ctl_perm does NOT grant the superuser all rights automatically, because @@ -1234,6 +1299,7 @@ static inline int ctl_perm(ctl_table *table, int op) return test_perm(table->mode, op); } +#ifdef CONFIG_SYSCTL_SYSCALL static int parse_table(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, @@ -1323,6 +1389,7 @@ int do_sysctl_strategy (ctl_table *table, } return 0; } +#endif /* CONFIG_SYSCTL_SYSCALL */ /** * register_sysctl_table - register a sysctl hierarchy @@ -1410,7 +1477,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, else list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); spin_unlock(&sysctl_lock); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL register_proc_table(table, proc_sys_root, tmp); #endif return tmp; @@ -1428,18 +1495,31 @@ void unregister_sysctl_table(struct ctl_table_header * header) might_sleep(); spin_lock(&sysctl_lock); start_unregistering(header); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL unregister_proc_table(header->ctl_table, proc_sys_root); #endif spin_unlock(&sysctl_lock); kfree(header); } +#else /* !CONFIG_SYSCTL */ +struct ctl_table_header * register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + return NULL; +} + +void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +#endif /* CONFIG_SYSCTL */ + /* * /proc/sys support */ -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL /* Scan the sysctl entries in table and add them all into /proc */ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) @@ -1590,32 +1670,15 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf, return do_rw_proc(1, file, (char __user *) buf, count, ppos); } -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. 
If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int _proc_do_string(void* data, int maxlen, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) { size_t len; char __user *p; char c; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; @@ -1631,20 +1694,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, break; len++; } - if (len >= table->maxlen) - len = table->maxlen-1; - if(copy_from_user(table->data, buffer, len)) + if (len >= maxlen) + len = maxlen-1; + if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) table->data)[len] = 0; + ((char *) data)[len] = 0; *ppos += *lenp; } else { - len = strlen(table->data); - if (len > table->maxlen) - len = table->maxlen; + len = strlen(data); + if (len > maxlen) + len = maxlen; if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, table->data, len)) + if(copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { if(put_user('\n', ((char __user *) buffer) + len)) @@ -1657,12 +1720,38 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return 0; } +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return _proc_do_string(table->data, table->maxlen, write, filp, + buffer, lenp, ppos); +} + /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? 
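
The refactor above splits the copying logic out of proc_dostring() so the UTS and IPC handlers can reuse it on namespace-local buffers. A user-space sketch of the read semantics it implements, with a function name of my own (not the kernel's):

#include <string.h>

/* Illustrative only: the read path of _proc_do_string() above, reduced
 * to plain C. At most maxlen bytes of the kernel string are exposed,
 * and a trailing '\n' is appended when the user buffer has room.
 */
static size_t proc_string_read_sketch(const char *data, size_t maxlen,
				      char *ubuf, size_t ulen)
{
	size_t len = strlen(data);

	if (len > maxlen)
		len = maxlen;
	if (len > ulen)
		len = ulen;
	memcpy(ubuf, data, len);	/* copy_to_user() in the kernel */
	if (len < ulen)
		ubuf[len++] = '\n';	/* put_user('\n', ...) */
	return len;
}
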
*/ -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +#ifndef CONFIG_UTS_NS +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int r; @@ -1678,6 +1767,48 @@ static int proc_doutsstring(ctl_table *table, int write, struct file *filp, } return r; } +#else /* !CONFIG_UTS_NS */ +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + struct uts_namespace* uts_ns = current->nsproxy->uts_ns; + char* which; + + switch (table->ctl_name) { + case KERN_OSTYPE: + which = uts_ns->name.sysname; + break; + case KERN_NODENAME: + which = uts_ns->name.nodename; + break; + case KERN_OSRELEASE: + which = uts_ns->name.release; + break; + case KERN_VERSION: + which = uts_ns->name.version; + break; + case KERN_DOMAINNAME: + which = uts_ns->name.domainname; + break; + default: + r = -EINVAL; + goto out; + } + + if (!write) { + down_read(&uts_sem); + r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); + up_read(&uts_sem); + } else { + down_write(&uts_sem); + r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); + up_write(&uts_sem); + } + out: + return r; +} +#endif /* !CONFIG_UTS_NS */ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1698,8 +1829,9 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos, +static int __do_proc_dointvec(void *tbl_data, ctl_table *table, + int write, struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -1712,13 +1844,13 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (int *) table->data; + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -1807,6 +1939,16 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, #undef TMPBUFLEN } +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, filp, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1878,7 +2020,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, return -EPERM; } - op = (current->pid == 1) ? OP_SET : OP_AND; + op = is_init(current) ? 
OP_SET : OP_AND; return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, do_proc_dointvec_bset_conv,&op); } @@ -1940,7 +2082,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, do_proc_dointvec_minmax_conv, ¶m); } -static int do_proc_doulongvec_minmax(ctl_table *table, int write, +static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, @@ -1954,13 +2096,13 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (unsigned long *) table->data; + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); @@ -2045,6 +2187,17 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, #undef TMPBUFLEN } +static int do_proc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + filp, buffer, lenp, ppos, convmul, convdiv); +} + /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table @@ -2233,6 +2386,71 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, do_proc_dointvec_ms_jiffies_conv, NULL); } +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *data; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + switch (table->ctl_name) { + case KERN_SHMMAX: + data = &ns->shm_ctlmax; + goto proc_minmax; + case KERN_SHMALL: + data = &ns->shm_ctlall; + goto proc_minmax; + case KERN_SHMMNI: + data = &ns->shm_ctlmni; + break; + case KERN_MSGMAX: + data = &ns->msg_ctlmax; + break; + case KERN_MSGMNI: + data = &ns->msg_ctlmni; + break; + case KERN_MSGMNB: + data = &ns->msg_ctlmnb; + break; + case KERN_SEM: + data = &ns->sem_ctls; + break; + default: + return -EINVAL; + } + + return __do_proc_dointvec(data, table, write, filp, buffer, + lenp, ppos, NULL, NULL); +proc_minmax: + return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, + lenp, ppos, 1l, 1l); +} +#endif + +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp; + int r; + + tmp = pid_nr(cad_pid); + + r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + lenp, ppos, NULL, NULL); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + #else /* CONFIG_PROC_FS */ int proc_dostring(ctl_table *table, int write, struct file *filp, @@ -2241,12 +2459,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return -ENOSYS; } -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, 
size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + int proc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2301,6 +2527,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SYSCTL_SYSCALL /* * General sysctl support routines */ @@ -2443,11 +2670,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return 1; } -#else /* CONFIG_SYSCTL */ +#else /* CONFIG_SYSCTL_SYSCALL */ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) { + static int msg_count; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the removed sysctl " + "system call\n", current->comm); + } return -ENOSYS; } @@ -2479,73 +2714,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return -ENOSYS; } -int proc_dostring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, - struct file *filp, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -struct ctl_table_header * register_sysctl_table(ctl_table * table, - int insert_at_head) -{ - return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SYSCTL_SYSCALL */ /* * No sense putting this after each symbol definition, twice, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2ed4040d0dc5..5d6a8c54ee85 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -18,7 +18,9 @@ #include <linux/kernel.h> #include <linux/taskstats_kern.h> +#include <linux/tsacct_kern.h> #include <linux/delayacct.h> +#include <linux/tsacct_kern.h> #include <linux/cpumask.h> #include <linux/percpu.h> #include <net/genetlink.h> @@ -75,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(size, GFP_KERNEL); + skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); if (!skb) return -ENOMEM; @@ -198,7 +200,13 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, */ delayacct_add_tsk(stats, tsk); + + /* fill in basic acct fields */ stats->version = TASKSTATS_VERSION; + bacct_add_tsk(stats, tsk); + + /* fill in extended acct fields */ + xacct_add_tsk(stats, tsk); /* 
Define err: label here if needed */ put_task_struct(tsk); diff --git a/kernel/time.c b/kernel/time.c index 5bd489747643..0e017bff4c19 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -/* we call this to notify the arch when the clock is being - * controlled. If no such arch routine, do nothing. - */ -void __attribute__ ((weak)) notify_arch_cmos_timer(void) -{ - return; -} - -/* adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. - */ -int do_adjtimex(struct timex *txc) -{ - long ltemp, mtemp, save_adjust; - int result; - - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - /* singleshot must not be used with any other mode bits */ - if (txc->modes != ADJ_OFFSET_SINGLESHOT) - return -EINVAL; - - if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) - /* adjustment Offset limited to +- .512 seconds */ - if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) - return -EINVAL; - - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - result = time_state; /* mostly `TIME_OK' */ - - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_next_adjust ? time_next_adjust : time_adjust; - -#if 0 /* STA_CLOCKERR is never set yet */ - time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ -#endif - /* If there are input parameters, then process them */ - if (txc->modes) - { - if (txc->modes & ADJ_STATUS) /* only set allowed bits */ - time_status = (txc->status & ~STA_RONLY) | - (time_status & STA_RONLY); - - if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ - if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { - result = -EINVAL; - goto leave; - } - time_freq = txc->freq; - } - - if (txc->modes & ADJ_MAXERROR) { - if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_maxerror = txc->maxerror; - } - - if (txc->modes & ADJ_ESTERROR) { - if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_esterror = txc->esterror; - } - - if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ - if (txc->constant < 0) { /* NTP v4 uses values > 6 */ - result = -EINVAL; - goto leave; - } - time_constant = txc->constant; - } - - if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ - if (txc->modes == ADJ_OFFSET_SINGLESHOT) { - /* adjtime() is independent from ntp_adjtime() */ - if ((time_next_adjust = txc->offset) == 0) - time_adjust = 0; - } - else if (time_status & STA_PLL) { - ltemp = txc->offset; - - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ - if (ltemp > MAXPHASE) - time_offset = MAXPHASE << SHIFT_UPDATE; - else if (ltemp < -MAXPHASE) - time_offset = -(MAXPHASE << SHIFT_UPDATE); - else - time_offset = ltemp << SHIFT_UPDATE; - - /* - * Select whether the frequency is to be controlled - * and in which mode (PLL or FLL). Clamp to the operating - * range. Ugly multiply/divide should be replaced someday. 
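
As an aside on the code being removed here: these variables are exposed to user space through adjtimex(2), and with modes == 0 the call is a read-only query, so CAP_SYS_TIME is not required. A minimal probe:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("state=%d freq=%ld maxerror=%ld esterror=%ld\n",
	       state, tx.freq, tx.maxerror, tx.esterror);
	return 0;
}
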
- */ - - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; - if (time_status & STA_FLL) { - if (mtemp >= MINSEC) { - ltemp = (time_offset / mtemp) << (SHIFT_USEC - - SHIFT_UPDATE); - time_freq += shift_right(ltemp, SHIFT_KH); - } else /* calibration interval too short (p. 12) */ - result = TIME_ERROR; - } else { /* PLL mode */ - if (mtemp < MAXSEC) { - ltemp *= mtemp; - time_freq += shift_right(ltemp,(time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC)); - } else /* calibration interval too long (p. 12) */ - result = TIME_ERROR; - } - time_freq = min(time_freq, time_tolerance); - time_freq = max(time_freq, -time_tolerance); - } /* STA_PLL */ - } /* txc->modes & ADJ_OFFSET */ - if (txc->modes & ADJ_TICK) { - tick_usec = txc->tick; - tick_nsec = TICK_USEC_TO_NSEC(tick_usec); - } - } /* txc->modes */ -leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) - result = TIME_ERROR; - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset, SHIFT_UPDATE); - } - txc->freq = time_freq; - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; - txc->precision = time_precision; - txc->tolerance = time_tolerance; - txc->tick = tick_usec; - - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; - write_sequnlock_irq(&xtime_lock); - do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); - return(result); -} - asmlinkage long sys_adjtimex(struct timex __user *txc_p) { struct timex txc; /* Local copy of parameter */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e1dfd8e86cce..61a3907d16fb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1 +1 @@ -obj-y += clocksource.o jiffies.o +obj-y += ntp.o clocksource.o jiffies.o diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c new file mode 100644 index 000000000000..47195fa0ec4f --- /dev/null +++ b/kernel/time/ntp.c @@ -0,0 +1,350 @@ +/* + * linux/kernel/time/ntp.c + * + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. 
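
The new file keeps per-tick lengths as fixed-point nanoseconds shifted left by TICK_LENGTH_SHIFT (32 in kernels of this vintage; treat that as an assumption here). A standalone sketch of the dominant term of ntp_update_frequency(), omitting the CLOCK_TICK_ADJUST and time_freq corrections:

#include <stdint.h>

#define SKETCH_SHIFT 32		/* assumed value of TICK_LENGTH_SHIFT */

/* Length of one HZ tick in (ns << SKETCH_SHIFT) fixed point. */
static uint64_t tick_length_base_sketch(unsigned long tick_usec,
					unsigned long user_hz,
					unsigned long hz)
{
	uint64_t base = ((uint64_t)tick_usec * 1000 * user_hz) << SKETCH_SHIFT;

	return base / hz;	/* do_div(tick_length_base, HZ) in the kernel */
}

/* e.g. tick_usec = 10000, USER_HZ = 100, HZ = 1000 gives 1 ms per tick */
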
+ */ + +#include <linux/mm.h> +#include <linux/time.h> +#include <linux/timex.h> + +#include <asm/div64.h> +#include <asm/timex.h> + +/* + * Timekeeping variables + */ +unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ +unsigned long tick_nsec; /* ACTHZ period (nsec) */ +static u64 tick_length, tick_length_base; + +#define MAX_TICKADJ 500 /* microsecs */ +#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ + TICK_LENGTH_SHIFT) / HZ) + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +static int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +static long time_offset; /* time adjustment (ns) */ +static long time_constant = 2; /* pll time constant */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +long time_freq; /* frequency offset (scaled ppm)*/ +static long time_reftime; /* time at last adjustment (s) */ +long time_adjust; + +#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) +#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \ + (s64)CLOCK_TICK_RATE) + +static void ntp_update_frequency(void) +{ + tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; + tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + + do_div(tick_length_base, HZ); + + tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; +} + +/** + * ntp_clear - Clears the NTP state variables + * + * Must be called while holding a write on the xtime_lock + */ +void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + + tick_length = tick_length_base; + time_offset = 0; +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + */ +void second_overflow(void) +{ + long time_adj; + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. 
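
The leap-second handling below reduces to a five-state machine. A sketch of just the transitions (state names are mine), with the clock step expressed as a +-1 adjustment of a plain second counter:

enum leap_state { S_OK, S_INS, S_DEL, S_OOP, S_WAIT };

/* One end-of-second step of the machine in second_overflow(). */
static enum leap_state leap_step(enum leap_state s, int sta_ins, int sta_del,
				 long *sec)
{
	switch (s) {
	case S_OK:
		if (sta_ins) return S_INS;
		if (sta_del) return S_DEL;
		return S_OK;
	case S_INS:			/* insert 23:59:60 at midnight UTC */
		if (*sec % 86400 == 0) {
			--*sec;
			return S_OOP;
		}
		return S_INS;
	case S_DEL:			/* skip 23:59:59 at midnight UTC */
		if ((*sec + 1) % 86400 == 0) {
			++*sec;
			return S_WAIT;
		}
		return S_DEL;
	case S_OOP:			/* leap second in progress */
		return S_WAIT;
	default:			/* S_WAIT: done, wait for flags to clear */
		return (sta_ins || sta_del) ? S_WAIT : S_OK;
	}
}
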
+ */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. The offset is + * reduced by a fixed factor times the time constant. + */ + tick_length = tick_length_base; + time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); + time_offset -= time_adj; + tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); + + if (unlikely(time_adjust)) { + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + } else if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + } else { + time_adjust = 0; + tick_length += (s64)(time_adjust * NSEC_PER_USEC / + HZ) << TICK_LENGTH_SHIFT; + } + } +} + +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + return tick_length; +} + + +void __attribute__ ((weak)) notify_arch_cmos_timer(void) +{ + return; +} + +/* adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int do_adjtimex(struct timex *txc) +{ + long ltemp, mtemp, save_adjust; + s64 freq_adj, temp64; + int result; + + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* Now we validate the data before disabling interrupts */ + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + /* singleshot must not be used with any other mode bits */ + if (txc->modes != ADJ_OFFSET_SINGLESHOT) + return -EINVAL; + + if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) + /* adjustment Offset limited to +- .512 seconds */ + if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) + return -EINVAL; + + /* if the quartz is off by more than 10% something is VERY wrong ! 
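
That 10% bound, made concrete (the check itself follows just below):

/* Sketch of the ADJ_TICK validation: accept a requested USER_HZ tick
 * period only within +-10% of the nominal 1000000/USER_HZ microseconds.
 */
static int tick_in_range(long tick, long user_hz)
{
	return tick >= 900000 / user_hz && tick <= 1100000 / user_hz;
}
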
*/ + if (txc->modes & ADJ_TICK) + if (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ) + return -EINVAL; + + write_seqlock_irq(&xtime_lock); + result = time_state; /* mostly `TIME_OK' */ + + /* Save for later - semantics of adjtime is to return old value */ + save_adjust = time_adjust; + +#if 0 /* STA_CLOCKERR is never set yet */ + time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ +#endif + /* If there are input parameters, then process them */ + if (txc->modes) + { + if (txc->modes & ADJ_STATUS) /* only set allowed bits */ + time_status = (txc->status & ~STA_RONLY) | + (time_status & STA_RONLY); + + if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ + if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { + result = -EINVAL; + goto leave; + } + time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); + } + + if (txc->modes & ADJ_MAXERROR) { + if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_maxerror = txc->maxerror; + } + + if (txc->modes & ADJ_ESTERROR) { + if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_esterror = txc->esterror; + } + + if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ + if (txc->constant < 0) { /* NTP v4 uses values > 6 */ + result = -EINVAL; + goto leave; + } + time_constant = min(txc->constant + 4, (long)MAXTC); + } + + if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ + if (txc->modes == ADJ_OFFSET_SINGLESHOT) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + } + else if (time_status & STA_PLL) { + ltemp = txc->offset * NSEC_PER_USEC; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC); + time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC); + + /* + * Select whether the frequency is to be controlled + * and in which mode (PLL or FLL). Clamp to the operating + * range. Ugly multiply/divide should be replaced someday. 
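
One detail worth calling out before the frequency-selection code that follows: do_div() takes an unsigned 64-bit dividend, which is why the FLL term negates temp64 around the divide. The same pattern in plain C:

#include <stdint.h>

/* Signed 64-bit divide built on an unsigned one, mirroring the
 * temp64/do_div() handling in the FLL branch of do_adjtimex().
 */
static int64_t sdiv64_sketch(int64_t dividend, uint32_t divisor)
{
	uint64_t tmp = dividend < 0 ? -(uint64_t)dividend : (uint64_t)dividend;

	tmp /= divisor;			/* do_div(tmp, divisor) */
	return dividend < 0 ? -(int64_t)tmp : (int64_t)tmp;
}
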
+ */ + + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = xtime.tv_sec; + mtemp = xtime.tv_sec - time_reftime; + time_reftime = xtime.tv_sec; + + freq_adj = (s64)time_offset * mtemp; + freq_adj = shift_right(freq_adj, time_constant * 2 + + (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); + if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { + temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL); + if (time_offset < 0) { + temp64 = -temp64; + do_div(temp64, mtemp); + freq_adj -= temp64; + } else { + do_div(temp64, mtemp); + freq_adj += temp64; + } + } + freq_adj += time_freq; + freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); + time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); + time_offset = (time_offset / HZ) << SHIFT_UPDATE; + } /* STA_PLL */ + } /* txc->modes & ADJ_OFFSET */ + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); + } /* txc->modes */ +leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) + result = TIME_ERROR; + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + txc->offset = save_adjust; + else + txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; + txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = 1; + txc->tolerance = MAXFREQ; + txc->tick = tick_usec; + + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; + write_sequnlock_irq(&xtime_lock); + do_gettimeofday(&txc->time); + notify_arch_cmos_timer(); + return(result); +} diff --git a/kernel/timer.c b/kernel/timer.c index 1d7dd6267c2d..c1c7fbcffec1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -41,12 +41,6 @@ #include <asm/timex.h> #include <asm/io.h> -#ifdef CONFIG_TIME_INTERPOLATION -static void time_interpolator_update(long delta_nsec); -#else -#define time_interpolator_update(x) -#endif - u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -136,7 +130,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } -/*** +/** * init_timer - initialize a timer. * @timer: the timer to be initialized * @@ -175,6 +169,7 @@ static inline void detach_timer(struct timer_list *timer, */ static tvec_base_t *lock_timer_base(struct timer_list *timer, unsigned long *flags) + __acquires(timer->base->lock) { tvec_base_t *base; @@ -235,7 +230,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(__mod_timer); -/*** +/** * add_timer_on - start a timer on a particular CPU * @timer: the timer to be added * @cpu: the CPU to start it on @@ -255,9 +250,10 @@ void add_timer_on(struct timer_list *timer, int cpu) } -/*** +/** * mod_timer - modify a timer's timeout * @timer: the timer to be modified + * @expires: new timeout in jiffies * * mod_timer is a more efficient way to update the expire field of an * active timer (if the timer is inactive it will be activated) @@ -291,7 +287,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(mod_timer); -/*** +/** * del_timer - deactive a timer. 
* @timer: the timer to be deactivated * @@ -323,7 +319,10 @@ int del_timer(struct timer_list *timer) EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP -/* +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del + * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. * @@ -351,7 +350,7 @@ out: return ret; } -/*** +/** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated * @@ -401,15 +400,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) return index; } -/*** +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + +/** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. * * This function cascades all vectors and executes all expired timer * vectors. */ -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - static inline void __run_timers(tvec_base_t *base) { struct timer_list *timer; @@ -563,12 +562,6 @@ found: /******************************************************************/ -/* - * Timekeeping variables - */ -unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ -unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ - /* * The current time * wall_to_monotonic is what we need to add to xtime (or xtime corrected @@ -582,209 +575,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16))); EXPORT_SYMBOL(xtime); -/* Don't completely fail for HZ > 500. */ -int tickadj = 500/HZ ? : 1; /* microsecs */ - - -/* - * phase-lock loop variables - */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset; /* time adjustment (us) */ -long time_constant = 2; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ -long time_precision = 1; /* clock precision (us) */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; - /* frequency offset (scaled ppm)*/ -static long time_adj; /* tick adjust (scaled 1 / HZ) */ -long time_reftime; /* time at last adjustment (s) */ -long time_adjust; -long time_next_adjust; - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - * - */ -static void second_overflow(void) -{ - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. The microtime() - * routine or external clock driver will insure that reported time is - * always monotonic. The ugly divides should be replaced. 
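
Both the old code here and its relocated version lean on shift_right() rather than a bare >>, because right-shifting a negative signed integer is implementation-defined in C. The helper (from <linux/timex.h>) behaves roughly like this sketch, rounding toward zero (LONG_MIN excluded, as in the kernel macro):

static long shift_right_sketch(long x, unsigned int s)
{
	return x < 0 ? -((-x) >> s) : x >> s;
}
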
- */ - switch (time_state) { - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* - * The timer interpolator will make time change - * gradually instead of an immediate jump by one second - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second " - "23:59:60 UTC\n"); - } - break; - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* - * Use of time interpolator for a gradual change of - * time - */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second " - "23:59:59 UTC\n"); - } - break; - case TIME_OOP: - time_state = TIME_WAIT; - break; - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In PLL mode, the - * offset is reduced by a fixed factor times the time constant. In FLL - * mode the offset is used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread the adjustment - * over not more than the number of seconds between updates. - */ - ltemp = time_offset; - if (!(time_status & STA_FLL)) - ltemp = shift_right(ltemp, SHIFT_KG + time_constant); - ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); - ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); - time_offset -= ltemp; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - - /* - * Compute the frequency estimate and additional phase adjustment due - * to frequency error for the next second. - */ - ltemp = time_freq; - time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); - -#if HZ == 100 - /* - * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to - * get 128.125; => only 0.125% error (p. 14) - */ - time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); -#endif -#if HZ == 250 - /* - * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -#if HZ == 1000 - /* - * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -} - -/* - * Returns how many microseconds we need to add to xtime this tick - * in doing an adjustment requested with adjtime. - */ -static long adjtime_adjustment(void) -{ - long time_adjust_step; - - time_adjust_step = time_adjust; - if (time_adjust_step) { - /* - * We are doing an adjtime thing. Prepare time_adjust_step to - * be within bounds. Note that a positive time_adjust means we - * want the clock to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. 
+tickadj - */ - time_adjust_step = min(time_adjust_step, (long)tickadj); - time_adjust_step = max(time_adjust_step, (long)-tickadj); - } - return time_adjust_step; -} - -/* in the NTP reference this is called "hardclock()" */ -static void update_ntp_one_tick(void) -{ - long time_adjust_step; - - time_adjust_step = adjtime_adjustment(); - if (time_adjust_step) - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; - - /* Changes by adjtime() do not take effect till next tick. */ - if (time_next_adjust != 0) { - time_adjust = time_next_adjust; - time_next_adjust = 0; - } -} - -/* - * Return how long ticks are at the moment, that is, how much time - * update_wall_time_one_tick will add to xtime next time we call it - * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds shifted by the - * specified number of bits to the right of the binary point. - * This function has no side-effects. - */ -u64 current_tick_length(void) -{ - long delta_nsec; - u64 ret; - - /* calculate the finest interval NTP will allow. - * ie: nanosecond value shifted by (SHIFT_SCALE - 10) - */ - delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; - ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); - - return ret; -} /* XXX - all of this timekeeping code should be later moved to time.c */ #include <linux/clocksource.h> @@ -961,21 +751,24 @@ void __init timekeeping_init(void) unsigned long flags; write_seqlock_irqsave(&xtime_lock, flags); + + ntp_clear(); + clock = clocksource_get_next(); clocksource_calculate_interval(clock, tick_nsec); clock->cycle_last = clocksource_read(clock); - ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); } static int timekeeping_suspended; -/* +/** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused * * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ static int timekeeping_resume(struct sys_device *dev) @@ -1106,7 +899,7 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); } -/* +/** * update_wall_time - Uses the current clocksource to increment the wall time * * Called from the timer interrupt, must hold a write on xtime_lock. @@ -1144,8 +937,6 @@ static void update_wall_time(void) /* interpolator bits */ time_interpolator_update(clock->xtime_interval >> clock->shift); - /* increment the NTP state machine */ - update_ntp_one_tick(); /* accumulate error between NTP and clock interval */ clock->error += current_tick_length(); @@ -1217,19 +1008,14 @@ static inline void calc_load(unsigned long ticks) unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; - count -= ticks; - if (count < 0) { - count += LOAD_FREQ; - active_tasks = count_active_tasks(); + active_tasks = count_active_tasks(); + for (count -= ticks; count < 0; count += LOAD_FREQ) { CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); } } -/* jiffies at the most recent update of wall time */ -unsigned long wall_jiffies = INITIAL_JIFFIES; - /* * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. 
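
The calc_load() change above replays one LOAD_FREQ interval per loop iteration when ticks were lost, instead of folding them into a single oversized sample. The averaging itself is a fixed-point exponential decay; a self-contained sketch using the kernel's 1-minute constants (EXP_1 = 1884 for 5-second samples, 11-bit fraction):

#include <stdio.h>

#define FSHIFT	11			/* bits of fraction */
#define FIXED_1	(1 << FSHIFT)
#define EXP_1	1884			/* exp(-5s/1min) in fixed point */

/* One CALC_LOAD step: load = load*exp + active*(1 - exp) */
static unsigned long calc_load_sketch(unsigned long load, unsigned long active)
{
	load *= EXP_1;
	load += active * (FIXED_1 - EXP_1);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0, active = 3 * FIXED_1; /* 3 runnable tasks */
	int i;

	for (i = 0; i < 24; i++)	/* two minutes of 5-second samples */
		avenrun = calc_load_sketch(avenrun, active);
	printf("1-min loadavg ~ %lu.%02lu\n", avenrun >> FSHIFT,
	       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}
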
@@ -1265,12 +1051,8 @@ void run_local_timers(void) * Called by the timer interrupt. xtime_lock must already be taken * by the timer IRQ! */ -static inline void update_times(void) +static inline void update_times(unsigned long ticks) { - unsigned long ticks; - - ticks = jiffies - wall_jiffies; - wall_jiffies += ticks; update_wall_time(); calc_load(ticks); } @@ -1281,12 +1063,10 @@ static inline void update_times(void) * jiffies is defined in the linker script... */ -void do_timer(struct pt_regs *regs) +void do_timer(unsigned long ticks) { - jiffies_64++; - /* prevent loading jiffies before storing new jiffies_64 value. */ - barrier(); - update_times(); + jiffies_64 += ticks; + update_times(ticks); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1470,8 +1250,9 @@ asmlinkage long sys_gettid(void) return current->pid; } -/* +/** * sys_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill */ asmlinkage long sys_sysinfo(struct sysinfo __user *info) { @@ -1688,8 +1469,10 @@ static struct notifier_block __cpuinitdata timers_nb = { void __init init_timers(void) { - timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + + BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); } @@ -1774,7 +1557,7 @@ unsigned long time_interpolator_get_offset(void) #define INTERPOLATOR_ADJUST 65536 #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST -static void time_interpolator_update(long delta_nsec) +void time_interpolator_update(long delta_nsec) { u64 counter; unsigned long offset; diff --git a/kernel/tsacct.c b/kernel/tsacct.c new file mode 100644 index 000000000000..db443221ba5b --- /dev/null +++ b/kernel/tsacct.c @@ -0,0 +1,124 @@ +/* + * tsacct.c - System accounting over taskstats interface + * + * Copyright (C) Jay Lan, <jlan@sgi.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
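
Note the interface change in the hunk above: do_timer() now takes the number of elapsed ticks from its caller instead of deriving it from the removed wall_jiffies, so an architecture can replay lost ticks in a single call. Reduced to its essence (a sketch, not the kernel function):

/* The new contract in miniature: the caller counts elapsed ticks. */
static unsigned long long jiffies64_sketch;

static void do_timer_sketch(unsigned long ticks)
{
	jiffies64_sketch += ticks;	/* was jiffies_64++ per interrupt */
	/* update_times(ticks) follows: wall time, then load average */
}
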
+ * + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/tsacct_kern.h> +#include <linux/acct.h> +#include <linux/jiffies.h> + + +#define USEC_PER_TICK (USEC_PER_SEC/HZ) +/* + * fill in basic accounting fields + */ +void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + struct timespec uptime, ts; + s64 ac_etime; + + BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); + + /* calculate task elapsed time in timespec */ + do_posix_clock_monotonic_gettime(&uptime); + ts = timespec_sub(uptime, current->group_leader->start_time); + /* rebase elapsed time to usec */ + ac_etime = timespec_to_ns(&ts); + do_div(ac_etime, NSEC_PER_USEC); + stats->ac_etime = ac_etime; + stats->ac_btime = xtime.tv_sec - ts.tv_sec; + if (thread_group_leader(tsk)) { + stats->ac_exitcode = tsk->exit_code; + if (tsk->flags & PF_FORKNOEXEC) + stats->ac_flag |= AFORK; + } + if (tsk->flags & PF_SUPERPRIV) + stats->ac_flag |= ASU; + if (tsk->flags & PF_DUMPCORE) + stats->ac_flag |= ACORE; + if (tsk->flags & PF_SIGNALED) + stats->ac_flag |= AXSIG; + stats->ac_nice = task_nice(tsk); + stats->ac_sched = tsk->policy; + stats->ac_uid = tsk->uid; + stats->ac_gid = tsk->gid; + stats->ac_pid = tsk->pid; + stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; + stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; + stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; + stats->ac_minflt = tsk->min_flt; + stats->ac_majflt = tsk->maj_flt; + + strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); +} + + +#ifdef CONFIG_TASK_XACCT + +#define KB 1024 +#define MB (1024*KB) +/* + * fill in extended accounting fields + */ +void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) +{ + /* convert pages-jiffies to Mbyte-usec */ + stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; + stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; + if (p->mm) { + /* adjust to KB unit */ + stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; + stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; + } + stats->read_char = p->rchar; + stats->write_char = p->wchar; + stats->read_syscalls = p->syscr; + stats->write_syscalls = p->syscw; +} +#undef KB +#undef MB + +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ + if (likely(tsk->mm)) { + long delta = cputime_to_jiffies( + cputime_sub(tsk->stime, tsk->acct_stimexpd)); + + if (delta == 0) + return; + tsk->acct_stimexpd = tsk->stime; + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + } +} + +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared + */ +void acct_clear_integrals(struct task_struct *tsk) +{ + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; +} +#endif diff --git a/kernel/unwind.c b/kernel/unwind.c index f69c804c8e62..2e2368607aab 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -102,7 +102,7 @@ static struct unwind_table { unsigned long size; struct unwind_table *link; const char *name; -} root_table, *last_table; +} root_table; struct unwind_item { enum item_location { @@ -174,6 +174,8 @@ void __init unwind_init(void) #ifdef CONFIG_MODULES +static struct unwind_table *last_table; + /* Must be called with module_mutex held. 
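
On the accounting integrals that land in tsacct.c below: acct_update_integrals() forms its delta in cputime units via cputime_sub() before converting to jiffies, keeping both operands in the same unit. What it accumulates is a time-weighted memory sum; a plain-C sketch:

/* Accumulates integral(rss dt) and integral(total_vm dt) in
 * page-jiffies, as acct_update_integrals() does; xacct_add_tsk()
 * later scales these to Mbyte-usec.
 */
struct integrals_sketch {
	unsigned long stime_last;	/* acct_stimexpd analogue, jiffies */
	unsigned long rss_mem1;
	unsigned long vm_mem1;
};

static void integrals_update_sketch(struct integrals_sketch *s,
				    unsigned long stime, /* jiffies */
				    unsigned long rss_pages,
				    unsigned long vm_pages)
{
	unsigned long delta = stime - s->stime_last;

	if (!delta)
		return;
	s->stime_last = stime;
	s->rss_mem1 += delta * rss_pages;
	s->vm_mem1 += delta * vm_pages;
}
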
*/ void *unwind_add_table(struct module *module, const void *table_start, @@ -603,6 +605,7 @@ int unwind(struct unwind_frame_info *frame) #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) const u32 *fde = NULL, *cie = NULL; const u8 *ptr = NULL, *end = NULL; + unsigned long pc = UNW_PC(frame) - frame->call_frame; unsigned long startLoc = 0, endLoc = 0, cfa; unsigned i; signed ptrType = -1; @@ -612,7 +615,7 @@ int unwind(struct unwind_frame_info *frame) if (UNW_PC(frame) == 0) return -EINVAL; - if ((table = find_table(UNW_PC(frame))) != NULL + if ((table = find_table(pc)) != NULL && !(table->size & (sizeof(*fde) - 1))) { unsigned long tableSize = table->size; @@ -647,7 +650,7 @@ int unwind(struct unwind_frame_info *frame) ptrType & DW_EH_PE_indirect ? ptrType : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); - if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) + if (pc >= startLoc && pc < endLoc) break; cie = NULL; } @@ -657,16 +660,28 @@ int unwind(struct unwind_frame_info *frame) state.cieEnd = ptr; /* keep here temporarily */ ptr = (const u8 *)(cie + 2); end = (const u8 *)(cie + 1) + *cie; + frame->call_frame = 1; if ((state.version = *ptr) != 1) cie = NULL; /* unsupported version */ else if (*++ptr) { /* check if augmentation size is first (and thus present) */ if (*ptr == 'z') { - /* check for ignorable (or already handled) - * nul-terminated augmentation string */ - while (++ptr < end && *ptr) - if (strchr("LPR", *ptr) == NULL) + while (++ptr < end && *ptr) { + switch(*ptr) { + /* check for ignorable (or already handled) + * nul-terminated augmentation string */ + case 'L': + case 'P': + case 'R': + continue; + case 'S': + frame->call_frame = 0; + continue; + default: break; + } + break; + } } if (ptr >= end || *ptr) cie = NULL; @@ -755,7 +770,7 @@ int unwind(struct unwind_frame_info *frame) state.org = startLoc; memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); /* process instructions */ - if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) + if (!processCFI(ptr, end, pc, ptrType, &state) || state.loc > endLoc || state.regs[retAddrReg].where == Nowhere || state.cfa.reg >= ARRAY_SIZE(reg_info) @@ -763,6 +778,11 @@ int unwind(struct unwind_frame_info *frame) || state.cfa.offs % sizeof(unsigned long)) return -EIO; /* update frame */ +#ifndef CONFIG_AS_CFI_SIGNAL_FRAME + if(frame->call_frame + && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign)) + frame->call_frame = 0; +#endif cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; startLoc = min((unsigned long)UNW_SP(frame), cfa); endLoc = max((unsigned long)UNW_SP(frame), cfa); @@ -866,6 +886,7 @@ int unwind_init_frame_info(struct unwind_frame_info *info, /*const*/ struct pt_regs *regs) { info->task = tsk; + info->call_frame = 0; arch_unw_init_frame_info(info, regs); return 0; @@ -879,6 +900,7 @@ int unwind_init_blocked(struct unwind_frame_info *info, struct task_struct *tsk) { info->task = tsk; + info->call_frame = 0; arch_unw_init_blocked(info); return 0; @@ -894,6 +916,7 @@ int unwind_init_running(struct unwind_frame_info *info, void *arg) { info->task = current; + info->call_frame = 0; return arch_unwind_init_running(info, callback, arg); } diff --git a/kernel/utsname.c b/kernel/utsname.c new file mode 100644 index 000000000000..c859164a6993 --- /dev/null +++ b/kernel/utsname.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Author: Serge Hallyn <serue@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General 
Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/module.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <linux/version.h> + +/* + * Clone a new ns copying an original utsname, setting refcount to 1 + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +{ + struct uts_namespace *ns; + + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + if (ns) { + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + kref_init(&ns->kref); + } + return ns; +} + +/* + * unshare the current process' utsname namespace. + * called only in sys_unshare() + */ +int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) +{ + if (unshare_flags & CLONE_NEWUTS) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_uts = clone_uts_ns(current->nsproxy->uts_ns); + if (!*new_uts) + return -ENOMEM; + } + + return 0; +} + +/* + * Copy task tsk's utsname namespace, or clone it if flags + * specifies CLONE_NEWUTS. In latter case, changes to the + * utsname of this process won't be seen by parent, and vice + * versa. + */ +int copy_utsname(int flags, struct task_struct *tsk) +{ + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + struct uts_namespace *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_uts_ns(old_ns); + + if (!(flags & CLONE_NEWUTS)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = clone_uts_ns(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + tsk->nsproxy->uts_ns = new_ns; + +out: + put_uts_ns(old_ns); + return err; +} + +void free_uts_ns(struct kref *kref) +{ + struct uts_namespace *ns; + + ns = container_of(kref, struct uts_namespace, kref); + kfree(ns); +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 835fe28b87a8..cfc737bffe6d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -34,7 +34,7 @@ * possible cpu). * * The sequence counters are for flush_scheduled_work(). It wants to wait - * until until all currently-scheduled works are completed, but it doesn't + * until all currently-scheduled works are completed, but it doesn't * want to be livelocked by new, incoming ones. So it waits until * remove_sequence is >= the insert_sequence which pertained when * flush_scheduled_work() was called.
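
Finally, the utsname namespace lifecycle above is the standard kref pattern: clone_uts_ns() starts each copy at refcount 1, sharers take references via get_uts_ns(), and the final put lands in free_uts_ns(). The same shape in user-space C, with illustrative names:

#include <stdlib.h>
#include <string.h>

struct ns_sketch {
	int refcount;
	char nodename[65];
};

static struct ns_sketch *ns_clone(const struct ns_sketch *old)
{
	struct ns_sketch *ns = malloc(sizeof(*ns));

	if (!ns)
		return NULL;		/* clone_uts_ns() maps this to -ENOMEM */
	memcpy(ns, old, sizeof(*ns));
	ns->refcount = 1;		/* kref_init() */
	return ns;
}

static void ns_put(struct ns_sketch *ns)
{
	if (--ns->refcount == 0)	/* kref_put() -> free_uts_ns() */
		free(ns);
}
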