diff options
author | David Woodhouse <dwmw2@infradead.org> | 2006-06-26 17:35:44 +0200 |
---|---|---|
committer | David Woodhouse <dwmw2@infradead.org> | 2006-06-26 17:35:44 +0200 |
commit | 62ed948cb1405fe95d61d8c6445c102e0c9da0a6 (patch) | |
tree | f139adcc861a05e7cc09cdb387a271a652fc2d07 /kernel | |
parent | [MTD] Initialize 'writesize' (diff) | |
parent | [PATCH] uclinux: use PER_LINUX_32BIT in binfmt_flat (diff) | |
download | linux-62ed948cb1405fe95d61d8c6445c102e0c9da0a6.tar.xz linux-62ed948cb1405fe95d61d8c6445c102e0c9da0a6.zip |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c | 117 | ||||
-rw-r--r-- | kernel/auditsc.c | 1 | ||||
-rw-r--r-- | kernel/compat.c | 30 | ||||
-rw-r--r-- | kernel/cpuset.c | 16 | ||||
-rw-r--r-- | kernel/exit.c | 14 | ||||
-rw-r--r-- | kernel/fork.c | 6 | ||||
-rw-r--r-- | kernel/futex.c | 8 | ||||
-rw-r--r-- | kernel/hrtimer.c | 15 | ||||
-rw-r--r-- | kernel/irq/handle.c | 5 | ||||
-rw-r--r-- | kernel/irq/migration.c | 4 | ||||
-rw-r--r-- | kernel/irq/proc.c | 3 | ||||
-rw-r--r-- | kernel/irq/spurious.c | 12 | ||||
-rw-r--r-- | kernel/kexec.c | 6 | ||||
-rw-r--r-- | kernel/ksysfs.c | 19 | ||||
-rw-r--r-- | kernel/kthread.c | 61 | ||||
-rw-r--r-- | kernel/module.c | 2 | ||||
-rw-r--r-- | kernel/power/Kconfig | 9 | ||||
-rw-r--r-- | kernel/power/disk.c | 2 | ||||
-rw-r--r-- | kernel/power/main.c | 6 | ||||
-rw-r--r-- | kernel/power/power.h | 2 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 148 | ||||
-rw-r--r-- | kernel/power/swsusp.c | 20 | ||||
-rw-r--r-- | kernel/printk.c | 52 | ||||
-rw-r--r-- | kernel/rcupdate.c | 13 | ||||
-rw-r--r-- | kernel/sched.c | 18 | ||||
-rw-r--r-- | kernel/softirq.c | 2 | ||||
-rw-r--r-- | kernel/softlockup.c | 4 | ||||
-rw-r--r-- | kernel/stop_machine.c | 17 | ||||
-rw-r--r-- | kernel/sys.c | 80 | ||||
-rw-r--r-- | kernel/sys_ni.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 22 | ||||
-rw-r--r-- | kernel/timer.c | 32 | ||||
-rw-r--r-- | kernel/user.c | 2 | ||||
-rw-r--r-- | kernel/workqueue.c | 34 |
34 files changed, 499 insertions, 285 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index b327f4d20104..368c4f03fe0e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(long, struct file *); +static void do_acct_process(struct file *); /* * This structure is used so that all the data protected by lock @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + if (vfs_statfs(file->f_dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_vfsmnt); spin_unlock(&acct_globals.lock); - do_acct_process(0, old_acct); + do_acct_process(old_acct); filp_close(old_acct, NULL); spin_lock(&acct_globals.lock); } @@ -419,16 +419,15 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(long exitcode, struct file *file) +static void do_acct_process(struct file *file) { + struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; mm_segment_t fs; - unsigned long vsize; unsigned long flim; u64 elapsed; u64 run_time; struct timespec uptime; - unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -469,12 +468,6 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - jiffies = cputime_to_jiffies(cputime_add(current->utime, - current->signal->utime)); - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); - jiffies = cputime_to_jiffies(cputime_add(current->stime, - current->signal->stime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -496,37 +489,18 @@ static void do_acct_process(long exitcode, struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - ac.ac_flag = 0; - if (current->flags & PF_FORKNOEXEC) - ac.ac_flag |= AFORK; - if (current->flags & PF_SUPERPRIV) - ac.ac_flag |= ASU; - if (current->flags & PF_DUMPCORE) - ac.ac_flag |= ACORE; - if (current->flags & PF_SIGNALED) - ac.ac_flag |= AXSIG; - - vsize = 0; - if (current->mm) { - struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); - vma = current->mm->mmap; - while (vma) { - vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - up_read(¤t->mm->mmap_sem); - } - vsize = vsize / 1024; - ac.ac_mem = encode_comp_t(vsize); + spin_lock(¤t->sighand->siglock); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac.ac_flag = pacct->ac_flag; + ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_minflt = encode_comp_t(pacct->ac_minflt); + ac.ac_majflt = encode_comp_t(pacct->ac_majflt); + ac.ac_exitcode = pacct->ac_exitcode; + spin_unlock(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_minflt = encode_comp_t(current->signal->min_flt + - current->min_flt); - ac.ac_majflt = encode_comp_t(current->signal->maj_flt + - current->maj_flt); ac.ac_swaps = encode_comp_t(0); - ac.ac_exitcode = exitcode; /* * Kernel segment override to datasegment and write it @@ -546,12 +520,63 @@ static void do_acct_process(long exitcode, struct file *file) } /** + * acct_init_pacct - initialize a new pacct_struct + */ +void acct_init_pacct(struct pacct_struct *pacct) +{ + memset(pacct, 0, sizeof(struct pacct_struct)); + pacct->ac_utime = pacct->ac_stime = cputime_zero; +} + +/** + * acct_collect - collect accounting information into pacct_struct + * @exitcode: task exit code + * @group_dead: not 0, if this thread is the last one in the process. + */ +void acct_collect(long exitcode, int group_dead) +{ + struct pacct_struct *pacct = ¤t->signal->pacct; + unsigned long vsize = 0; + + if (group_dead && current->mm) { + struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); + vma = current->mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + up_read(¤t->mm->mmap_sem); + } + + spin_lock_irq(¤t->sighand->siglock); + if (group_dead) + pacct->ac_mem = vsize / 1024; + if (thread_group_leader(current)) { + pacct->ac_exitcode = exitcode; + if (current->flags & PF_FORKNOEXEC) + pacct->ac_flag |= AFORK; + } + if (current->flags & PF_SUPERPRIV) + pacct->ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + pacct->ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + pacct->ac_flag |= AXSIG; + pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); + pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_minflt += current->min_flt; + pacct->ac_majflt += current->maj_flt; + spin_unlock_irq(¤t->sighand->siglock); +} + +/** * acct_process - now just a wrapper around do_acct_process * @exitcode: task exit code * * handles process accounting for an exiting task */ -void acct_process(long exitcode) +void acct_process() { struct file *file = NULL; @@ -570,7 +595,7 @@ void acct_process(long exitcode) get_file(file); spin_unlock(&acct_globals.lock); - do_acct_process(exitcode, file); + do_acct_process(file); fput(file); } @@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk) */ void acct_clear_integrals(struct task_struct *tsk) { - if (tsk) { - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; - } + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b097ccb4eb7e..9ebd96fda295 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1558,6 +1558,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) + * @ipcp: in-kernel IPC permissions * * Returns 0 for success or NULL context or < 0 on error. */ diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d8..126dee9530aa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include <linux/unistd.h> #include <linux/security.h> #include <linux/timex.h> +#include <linux/migrate.h> #include <asm/uaccess.h> @@ -729,17 +730,10 @@ void sigset_from_compat (sigset_t *set, compat_sigset_t *compat) { switch (_NSIG_WORDS) { -#if defined (__COMPAT_ENDIAN_SWAP__) - case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); - case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); - case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); - case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); -#else case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); -#endif } } @@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) return ret; } + +#ifdef CONFIG_NUMA +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, + compat_uptr_t __user *pages32, + const int __user *nodes, + int __user *status, + int flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, pages32 + i) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572b..b602f73fb38d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -41,6 +41,7 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/security.h> #include <linux/slab.h> #include <linux/smp_lock.h> #include <linux/spinlock.h> @@ -392,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, return 0; } -static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int cpuset_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, cpuset_fill_super); + return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); } static struct file_system_type cpuset_fs_type = { @@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; + int retval; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + retval = security_task_setscheduler(tsk, 0, NULL); + if (retval) { + put_task_struct(tsk); + return retval; + } + mutex_lock(&callback_mutex); task_lock(tsk); diff --git a/kernel/exit.c b/kernel/exit.c index e06d0c10a24e..e76bd02e930e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -36,6 +36,7 @@ #include <linux/compat.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ +#include <linux/resource.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -45,8 +46,6 @@ extern void sem_exit (void); extern struct task_struct *child_reaper; -int getrusage(struct task_struct *, int, struct rusage __user *); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) @@ -579,7 +578,7 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - if (mm != tsk->active_mm) BUG(); + BUG_ON(mm != tsk->active_mm); /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; @@ -895,11 +894,11 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); - acct_process(code); } + acct_collect(code, group_dead); if (unlikely(tsk->robust_list)) exit_robust_list(tsk); -#ifdef CONFIG_COMPAT +#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif @@ -907,6 +906,8 @@ fastcall NORET_TYPE void do_exit(long code) audit_free(tsk); exit_mm(tsk); + if (group_dead) + acct_process(); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); @@ -1530,8 +1531,7 @@ check_continued: if (options & __WNOTHREAD) break; tsk = next_thread(tsk); - if (tsk->signal != current->signal) - BUG(); + BUG_ON(tsk->signal != current->signal); } while (tsk != current); read_unlock(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index ac8100e3088a..dfd10cb370c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm) */ void mmput(struct mm_struct *mm) { + might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); @@ -623,6 +625,7 @@ out: /* * Allocate a new files structure and copy contents from the * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. */ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { @@ -631,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) int open_files, size, i, expand; struct fdtable *old_fdt, *new_fdt; + *errorp = -ENOMEM; newf = alloc_files(); if (!newf) goto out; @@ -744,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * break this. */ tsk->files = NULL; - error = -ENOMEM; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -871,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts tsk->it_prof_expires = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); } + acct_init_pacct(&sig->pacct); return 0; } diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057b..e1a380c77a5a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, (unsigned long)uaddr2, val2, val3); } -static struct super_block * -futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int futexfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); } static struct file_system_type futex_fs_type = { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 18324305724a..55601b3ce60e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = /** * ktime_get_ts - get the monotonic clock in timespec format - * * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime @@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) # ifndef CONFIG_KTIME_SCALAR /** * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * * @kt: addend * @nsec: the scalar nsec value to add * @@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /** * hrtimer_forward - forward the timer expiry - * * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward @@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) /** * hrtimer_start - (re)start an relative timer on the current CPU - * * @timer: the timer to be added * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) @@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); /** * hrtimer_try_to_cancel - try to deactivate a timer - * * @timer: hrtimer to stop * * Returns: * 0 when the timer was not active * 1 when the timer was active * -1 when the timer is currently excuting the callback function and - * can not be stopped + * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { @@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. - * * @timer: the timer to be cancelled * * Returns: @@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer - * * @timer: the timer to read */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) @@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void) /** * hrtimer_init - initialize a timer to the given clock - * * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel @@ -576,7 +568,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) clock_id = CLOCK_MONOTONIC; @@ -588,7 +580,6 @@ EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_get_res - get the timer resolution for a clock - * * @which_clock: which clock to query * @tp: pointer to timespec variable to store the resolution * @@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_base *bases; - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); *tp = ktime_to_timespec(bases[which_clock].resolution); return 0; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37db..0f6530117105 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /* * Have got an event to handle: */ -fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, +fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action) { - int ret, retval = 0, status = 0; + irqreturn_t ret, retval = IRQ_NONE; + unsigned int status = 0; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 134f9f2e0e39..a12d00eb5e7c 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -30,7 +30,7 @@ void move_native_irq(int irq) desc->move_irq = 0; - if (likely(cpus_empty(pending_irq_cpumask[irq]))) + if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) return; if (!desc->handler->set_affinity) @@ -49,7 +49,7 @@ void move_native_irq(int irq) * cause some ioapics to mal-function. * Being paranoid i guess! */ - if (unlikely(!cpus_empty(tmp))) { + if (likely(!cpus_empty(tmp))) { if (likely(!(desc->status & IRQ_DISABLED))) desc->handler->disable(irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce0..afacd6f585fa 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; #ifdef CONFIG_GENERIC_PENDING_IRQ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); + /* * Save these away for later use. Re-progam when the * interrupt is pending @@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) #else void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7df9abd5ec86..b2fb3c18d06b 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -11,7 +11,7 @@ #include <linux/kallsyms.h> #include <linux/interrupt.h> -static int irqfixup; +static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. @@ -136,9 +136,9 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, struct pt_regs *regs) { - if (action_ret != IRQ_HANDLED) { + if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; - if (action_ret != IRQ_NONE) + if (unlikely(action_ret != IRQ_NONE)) report_bad_irq(irq, desc, action_ret); } @@ -152,11 +152,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, } desc->irq_count++; - if (desc->irq_count < 100000) + if (likely(desc->irq_count < 100000)) return; desc->irq_count = 0; - if (desc->irqs_unhandled > 99900) { + if (unlikely(desc->irqs_unhandled > 99900)) { /* * The interrupt is stuck */ @@ -171,7 +171,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, desc->irqs_unhandled = 0; } -int noirqdebug; +int noirqdebug __read_mostly; int __init noirqdebug_setup(char *str) { diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0e..58f0f382597c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image = NULL; -static struct kimage *kexec_crash_image = NULL; +struct kimage *kexec_image; +struct kimage *kexec_crash_image; /* * A home grown binary mutex. * Nothing can wait so this mutex is safe to use * in interrupt context :) */ -static int kexec_lock = 0; +static int kexec_lock; asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67b..9e28478a17a5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include <linux/sysfs.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/kexec.h> #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_KEXEC +static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_image); +} +KERNEL_ATTR_RO(kexec_loaded); + +static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_crash_image); +} +KERNEL_ATTR_RO(kexec_crash_loaded); +#endif /* CONFIG_KEXEC */ + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -56,6 +71,10 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_KEXEC + &kexec_loaded_attr.attr, + &kexec_crash_loaded_attr.attr, +#endif NULL }; diff --git a/kernel/kthread.c b/kernel/kthread.c index c5f3c6613b6d..24be714b04c7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -45,6 +45,13 @@ struct kthread_stop_info static DEFINE_MUTEX(kthread_stop_lock); static struct kthread_stop_info kthread_stop_info; +/** + * kthread_should_stop - should this kthread return now? + * + * When someone calls kthread_stop on your kthread, it will be woken + * and this will return true. You should then return, and your return + * value will be passed through to kthread_stop(). + */ int kthread_should_stop(void) { return (kthread_stop_info.k == current); @@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create) complete(&create->done); } +/** + * kthread_create - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(), kthread_create_on_cpu(). + * + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn can either call do_exit() directly if it is a + * standalone thread for which noone will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM). + */ struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], @@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @k: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create(). + */ void kthread_bind(struct task_struct *k, unsigned int cpu) { BUG_ON(k->state != TASK_INTERRUPTIBLE); @@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) } EXPORT_SYMBOL(kthread_bind); +/** + * kthread_stop - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_stop() for @k to return true, wakes it, and + * waits for it to exit. Your threadfn() must not call do_exit() + * itself if you use this function! This can also be called after + * kthread_create() instead of calling wake_up_process(): the thread + * will exit without calling threadfn(). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop(struct task_struct *k) { return kthread_stop_sem(k, NULL); } EXPORT_SYMBOL(kthread_stop); +/** + * kthread_stop_sem - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * @s: semaphore that @k waits on while idle. + * + * Does essentially the same thing as kthread_stop() above, but wakes + * @k by calling up(@s). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -210,5 +269,5 @@ static __init int helper_init(void) return 0; } -core_initcall(helper_init); +core_initcall(helper_init); diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b0..d75275de1c28 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1326,7 +1326,7 @@ int is_exported(const char *name, const struct module *mod) if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; else - if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) + if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) return 1; else return 0; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ce0dfb8f4a4e..fc311a4673a2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -36,6 +36,15 @@ config PM_DEBUG code. This is helpful when debugging and reporting various PM bugs, like suspend support. +config PM_TRACE + bool "Suspend/resume event tracing" + depends on PM && PM_DEBUG && X86_32 + default y + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + config SOFTWARE_SUSPEND bool "Software Suspend" depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 81d4d982f3f0..e13e74067845 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -231,7 +231,7 @@ static int software_resume(void) late_initcall(software_resume); -static char * pm_disk_modes[] = { +static const char * const pm_disk_modes[] = { [PM_DISK_FIRMWARE] = "firmware", [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", diff --git a/kernel/power/main.c b/kernel/power/main.c index 0a907f0dc56b..6d295c776794 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,7 +15,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/pm.h> - +#include <linux/console.h> #include "power.h" @@ -145,7 +145,7 @@ static void suspend_finish(suspend_state_t state) -static char *pm_states[PM_SUSPEND_MAX] = { +static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", #ifdef CONFIG_SOFTWARE_SUSPEND @@ -262,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) { suspend_state_t state = PM_SUSPEND_STANDBY; - char ** s; + const char * const *s; char *p; int error; int len; diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f21767..57a792982fb9 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -55,7 +55,7 @@ struct snapshot_handle { unsigned int page; unsigned int page_offset; unsigned int prev; - struct pbe *pbe; + struct pbe *pbe, *last_pbe; void *buffer; unsigned int buf_offset; }; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b78..24c96f354231 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -150,6 +150,10 @@ int restore_highmem(void) } return 0; } +#else +static inline unsigned int count_highmem_pages(void) {return 0;} +static inline int save_highmem(void) {return 0;} +static inline int restore_highmem(void) {return 0;} #endif static int pfn_is_nosave(unsigned long pfn) @@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } -/** - * On resume it is necessary to trace and eventually free the unsafe - * pages that have been allocated, because they are needed for I/O - * (on x86-64 we likely will "eat" these pages once again while - * creating the temporary page translation tables) - */ - -struct eaten_page { - struct eaten_page *next; - char padding[PAGE_SIZE - sizeof(void *)]; -}; - -static struct eaten_page *eaten_pages = NULL; - -static void release_eaten_pages(void) -{ - struct eaten_page *p, *q; - - p = eaten_pages; - while (p) { - q = p->next; - /* We don't want swsusp_free() to free this page again */ - ClearPageNosave(virt_to_page(p)); - free_page((unsigned long)p); - p = q; - } - eaten_pages = NULL; -} +static unsigned int unsafe_pages; /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * which had been used before suspend. + * used before suspend. * * The unsafe pages are marked with the PG_nosave_free flag - * - * Allocated but unusable (ie eaten) memory pages should be marked - * so that swsusp_free() can release them + * and we count them using unsafe_pages */ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) { void *res; + res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - do { + while (res && PageNosaveFree(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) { - /* This is for swsusp_free() */ - SetPageNosave(virt_to_page(res)); - ((struct eaten_page *)res)->next = eaten_pages; - eaten_pages = res; - } - } while (res && PageNosaveFree(virt_to_page(res))); - else - res = (void *)get_zeroed_page(gfp_mask); + } if (res) { SetPageNosave(virt_to_page(res)); SetPageNosaveFree(virt_to_page(res)); @@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask) * On each page we set up a list of struct_pbe elements. */ -struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) +static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, + int safe_needed) { unsigned int num; struct pbe *pblist, *pbe; @@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist) return -EFAULT; } + unsafe_pages = 0; + return 0; } @@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, } /** - * create_image - use metadata contained in the PBE list + * prepare_image - use metadata contained in the PBE list * pointed to by pagedir_nosave to mark the pages that will * be overwritten in the process of restoring the system - * memory state from the image and allocate memory for - * the image avoiding these pages + * memory state from the image ("unsafe" pages) and allocate + * memory for the image + * + * The idea is to allocate the PBE list first and then + * allocate as many pages as it's needed for the image data, + * but not to assign these pages to the PBEs initially. + * Instead, we just mark them as allocated and create a list + * of "safe" which will be used later */ -static int create_image(struct snapshot_handle *handle) +struct safe_page { + struct safe_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct safe_page *safe_pages; + +static int prepare_image(struct snapshot_handle *handle) { int error = 0; - struct pbe *p, *pblist; + unsigned int nr_pages = nr_copy_pages; + struct pbe *p, *pblist = NULL; p = pagedir_nosave; error = mark_unsafe_pages(p); if (!error) { - pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); if (pblist) copy_page_backup_list(pblist, p); free_pagedir(p, 0); if (!pblist) error = -ENOMEM; } - if (!error) - error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + safe_pages = NULL; + if (!error && nr_pages > unsafe_pages) { + nr_pages -= unsafe_pages; + while (nr_pages--) { + struct safe_page *ptr; + + ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); + if (!ptr) { + error = -ENOMEM; + break; + } + if (!PageNosaveFree(virt_to_page(ptr))) { + /* The page is "safe", add it to the list */ + ptr->next = safe_pages; + safe_pages = ptr; + } + /* Mark the page as allocated */ + SetPageNosave(virt_to_page(ptr)); + SetPageNosaveFree(virt_to_page(ptr)); + } + } if (!error) { - release_eaten_pages(); pagedir_nosave = pblist; } else { - pagedir_nosave = NULL; handle->pbe = NULL; - nr_copy_pages = 0; - nr_meta_pages = 0; + swsusp_free(); } return error; } +static void *get_buffer(struct snapshot_handle *handle) +{ + struct pbe *pbe = handle->pbe, *last = handle->last_pbe; + struct page *page = virt_to_page(pbe->orig_address); + + if (PageNosave(page) && PageNosaveFree(page)) { + /* + * We have allocated the "original" page frame and we can + * use it directly to store the read page + */ + pbe->address = 0; + if (last && last->next) + last->next = NULL; + return (void *)pbe->orig_address; + } + /* + * The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the read page + */ + pbe->address = (unsigned long)safe_pages; + safe_pages = safe_pages->next; + if (last) + last->next = pbe; + handle->last_pbe = pbe; + return (void *)pbe->address; +} + /** * snapshot_write_next - used for writing the system memory snapshot. * @@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) } else if (handle->prev <= nr_meta_pages) { handle->pbe = unpack_orig_addresses(buffer, handle->pbe); if (!handle->pbe) { - error = create_image(handle); + error = prepare_image(handle); if (error) return error; handle->pbe = pagedir_nosave; - handle->buffer = (void *)handle->pbe->address; + handle->last_pbe = NULL; + handle->buffer = get_buffer(handle); } } else { handle->pbe = handle->pbe->next; - handle->buffer = (void *)handle->pbe->address; + handle->buffer = get_buffer(handle); } handle->prev = handle->page; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c4016cbbd3e0..17f669c83012 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void); int save_highmem(void); int restore_highmem(void); #else -static int save_highmem(void) { return 0; } -static int restore_highmem(void) { return 0; } -static unsigned int count_highmem_pages(void) { return 0; } +static inline int save_highmem(void) { return 0; } +static inline int restore_highmem(void) { return 0; } +static inline unsigned int count_highmem_pages(void) { return 0; } #endif /** @@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) */ #define SHRINK_BITE 10000 +static inline unsigned long __shrink_memory(long tmp) +{ + if (tmp > SHRINK_BITE) + tmp = SHRINK_BITE; + return shrink_all_memory(tmp); +} int swsusp_shrink_memory(void) { @@ -192,15 +198,17 @@ int swsusp_shrink_memory(void) PAGES_FOR_IO; tmp = size; for_each_zone (zone) - if (!is_highmem(zone)) + if (!is_highmem(zone) && populated_zone(zone)) { tmp -= zone->free_pages; + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + } if (tmp > 0) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(tmp); if (!tmp) return -ENOMEM; pages += tmp; } else if (size > image_size / PAGE_SIZE) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); pages += tmp; } printk("\b%c", p[i++%4]); diff --git a/kernel/printk.c b/kernel/printk.c index 19a955619294..95b7fe17f124 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -24,6 +24,7 @@ #include <linux/console.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/interrupt.h> /* For in_interrupt() */ #include <linux/config.h> #include <linux/delay.h> @@ -327,7 +328,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end) struct console *con; for (con = console_drivers; con; con = con->next) { - if ((con->flags & CON_ENABLED) && con->write) + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) con->write(con, &LOG_BUF(start), end - start); } } @@ -437,6 +440,7 @@ static int printk_time = 1; #else static int printk_time = 0; #endif +module_param(printk_time, int, S_IRUGO | S_IWUSR); static int __init printk_time_setup(char *str) { @@ -453,6 +457,18 @@ __attribute__((weak)) unsigned long long printk_clock(void) return sched_clock(); } +/* Check if we have any console registered that can be called early in boot. */ +static int have_callable_console(void) +{ + struct console *con; + + for (con = console_drivers; con; con = con->next) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + /** * printk - print a kernel message * @fmt: format string @@ -566,27 +582,29 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id())) { + if (!down_trylock(&console_sem)) { /* - * Some console drivers may assume that per-cpu resources have - * been allocated. So don't allow them to be called by this - * CPU until it is officially up. We shouldn't be calling into - * random console drivers on a CPU which doesn't exist yet.. + * We own the drivers. We can drop the spinlock and + * let release_console_sem() print the text, maybe ... */ + console_locked = 1; printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); - goto out; - } - if (!down_trylock(&console_sem)) { - console_locked = 1; + /* - * We own the drivers. We can drop the spinlock and let - * release_console_sem() print the text + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. */ - printk_cpu = UINT_MAX; - spin_unlock_irqrestore(&logbuf_lock, flags); - console_may_schedule = 0; - release_console_sem(); + if (cpu_online(smp_processor_id()) || have_callable_console()) { + console_may_schedule = 0; + release_console_sem(); + } else { + /* Release by hand to avoid flushing the buffer. */ + console_locked = 0; + up(&console_sem); + } } else { /* * Someone else owns the drivers. We drop the spinlock, which @@ -596,7 +614,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } -out: + preempt_enable(); return printed_len; } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bbb..20e9710fc21c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -612,14 +612,6 @@ void synchronize_rcu(void) wait_for_completion(&rcu.completion); } -/* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. - */ -void synchronize_kernel(void) -{ - synchronize_rcu(); -} - module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); @@ -627,7 +619,6 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ -EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); -EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ diff --git a/kernel/sched.c b/kernel/sched.c index c13f1bd2df7d..f06d059edef5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3886,6 +3886,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) !capable(CAP_SYS_NICE)) goto out_unlock; + retval = security_task_setscheduler(p, 0, NULL); + if (retval) + goto out_unlock; + cpus_allowed = cpuset_cpus_allowed(p); cpus_and(new_mask, new_mask, cpus_allowed); retval = set_cpus_allowed(p, new_mask); @@ -3954,7 +3958,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (!p) goto out_unlock; - retval = 0; + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + cpus_and(*mask, p->cpus_allowed, cpu_online_map); out_unlock: @@ -4046,6 +4053,9 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + __might_sleep(__FILE__, __LINE__); +#endif /* * The BKS might be reacquired before we have dropped * PREEMPT_ACTIVE, which could trigger a second @@ -4142,7 +4152,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4153,7 +4163,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); @@ -4746,6 +4756,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!cpu_rq(cpu)->migration_thread) + break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, any_online_cpu(cpu_online_map)); diff --git a/kernel/softirq.c b/kernel/softirq.c index 336f92d64e2e..9e2f1c6e73d7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(ksoftirqd, hotcpu)) + break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf02909..b5c3b94e01ce 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -36,7 +36,7 @@ static struct notifier_block panic_block = { void touch_softlockup_watchdog(void) { - per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; + __raw_get_cpu_var(touch_timestamp) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(watchdog_task, hotcpu)) + break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d731466..2c0aacc37c55 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/syscalls.h> +#include <linux/kthread.h> #include <asm/atomic.h> #include <asm/semaphore.h> #include <asm/uaccess.h> @@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; static DECLARE_MUTEX(stopmachine_mutex); -static int stopmachine(void *cpu) +static int stopmachine(void *unused) { int irqs_disabled = 0; int prepared = 0; - set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); - /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ atomic_inc(&stopmachine_thread_ack); @@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state) static int stop_machine(void) { - int i, ret = 0; + int ret = 0; + unsigned int i; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. */ @@ -96,11 +96,16 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { + struct task_struct *tsk; if (i == raw_smp_processor_id()) continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) + tsk = kthread_create(stopmachine, NULL, "stopmachine"); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); break; + } + kthread_bind(tsk, i); + wake_up_process(tsk); stopmachine_num_threads++; } diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936f..2d5179c67cec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -13,7 +13,6 @@ #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> -#include <linux/init.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -57,6 +56,12 @@ #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a,b) (-EINVAL) #endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -132,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v) { int ret = NOTIFY_DONE; - struct notifier_block *nb; + struct notifier_block *nb, *next_nb; nb = rcu_dereference(*nl); while (nb) { + next_nb = rcu_dereference(nb->next); ret = nb->notifier_call(nb, val, v); if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) break; - nb = rcu_dereference(nb->next); + nb = next_nb; } return ret; } @@ -583,7 +589,7 @@ void emergency_restart(void) } EXPORT_SYMBOL_GPL(emergency_restart); -void kernel_restart_prepare(char *cmd) +static void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; @@ -617,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ -void kernel_kexec(void) +static void kernel_kexec(void) { #ifdef CONFIG_KEXEC struct kimage *image; @@ -631,7 +637,6 @@ void kernel_kexec(void) machine_kexec(image); #endif } -EXPORT_SYMBOL_GPL(kernel_kexec); void kernel_shutdown_prepare(enum system_states state) { @@ -1860,23 +1865,20 @@ out: * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * - * tasklist_lock locking optimisation: - * If we are current and single threaded, we do not need to take the tasklist - * lock or the siglock. No one else can take our signal_struct away, - * no one else can reap the children to update signal->c* counters, and - * no one else can race with the signal-> fields. - * If we do not take the tasklist_lock, the signal-> fields could be read - * out of order while another thread was just exiting. So we place a - * read memory barrier when we avoid the lock. On the writer side, - * write memory barrier is implied in __exit_signal as __exit_signal releases - * the siglock spinlock after updating the signal-> fields. - * - * We don't really need the siglock when we access the non c* fields - * of the signal_struct (for RUSAGE_SELF) even in multithreaded - * case, since we take the tasklist lock for read and the non c* signal-> - * fields are updated only in __exit_signal, which is called with - * tasklist_lock taken for write, hence these two threads cannot execute - * concurrently. + * Locking: + * We need to take the siglock for CHILDEREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded + * non-current multithreaded. Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we donot need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. + * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. * */ @@ -1885,35 +1887,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; - int need_lock = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; - if (p != current || !thread_group_empty(p)) - need_lock = 1; - - if (need_lock) { - read_lock(&tasklist_lock); - if (unlikely(!p->signal)) { - read_unlock(&tasklist_lock); - return; - } - } else - /* See locking comments above */ - smp_rmb(); + rcu_read_lock(); + if (!lock_task_sighand(p, &flags)) { + rcu_read_unlock(); + return; + } switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: - spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; stime = p->signal->cstime; r->ru_nvcsw = p->signal->cnvcsw; r->ru_nivcsw = p->signal->cnivcsw; r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; - spin_unlock_irqrestore(&p->sighand->siglock, flags); if (who == RUSAGE_CHILDREN) break; @@ -1941,8 +1933,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) BUG(); } - if (need_lock) - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); + rcu_read_unlock(); + cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } @@ -2057,6 +2050,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, return -EFAULT; return 0; } + case PR_GET_ENDIAN: + error = GET_ENDIAN(current, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(current, arg2); + break; + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f1..6991bece67e8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); @@ -132,3 +133,4 @@ cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); +cond_syscall(compat_sys_move_pages); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0d656e61621d..2c0e65819448 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern int sysctl_panic_on_oom; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; @@ -142,7 +143,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; static ctl_table dev_table[]; @@ -202,12 +202,6 @@ static ctl_table root_table[] = { }, #endif { - .ctl_name = CTL_PROC, - .procname = "proc", - .mode = 0555, - .child = proc_table, - }, - { .ctl_name = CTL_FS, .procname = "fs", .mode = 0555, @@ -398,7 +392,7 @@ static ctl_table kern_table[] = { .strategy = &sysctl_string, }, #endif -#ifdef CONFIG_HOTPLUG +#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", @@ -702,6 +696,14 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = VM_PANIC_ON_OOM, + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, @@ -918,10 +920,6 @@ static ctl_table vm_table[] = { { .ctl_name = 0 } }; -static ctl_table proc_table[] = { - { .ctl_name = 0 } -}; - static ctl_table fs_table[] = { { .ctl_name = FS_NRINODE, diff --git a/kernel/timer.c b/kernel/timer.c index 9e49deed468c..eb97371b87d8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); + timer->base = __raw_get_cpu_var(tvec_bases); } EXPORT_SYMBOL(init_timer); @@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(tvec_base_t *base, tvec_t *tv, int index) { /* cascade all the timers from tv up one level */ - struct list_head *head, *curr; + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); - head = tv->vec + index; - curr = head->next; /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. */ - while (curr != head) { - struct timer_list *tmp; - - tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); - curr = curr->next; - internal_add_timer(base, tmp); + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(timer->base != base); + internal_add_timer(base, timer); } - INIT_LIST_HEAD(head); return index; } @@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base) spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list = LIST_HEAD_INIT(work_list); + struct list_head work_list; struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + /* * Cascade timers: */ @@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_splice_init(base->tv1.vec + index, &work_list); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; diff --git a/kernel/user.c b/kernel/user.c index 4b1eb745afa1..6408c0424291 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) new->mq_bytes = 0; new->locked_shm = 0; - if (alloc_uid_keyring(new) < 0) { + if (alloc_uid_keyring(new, current) < 0) { kmem_cache_free(uid_cachep, new); return NULL; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 880fb415a8f6..565cf7a1febd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu, return ret; } -int schedule_on_each_cpu(void (*func) (void *info), void *info) +/** + * schedule_on_each_cpu - call a function on each online CPU from keventd + * @func: the function to call + * @info: a pointer to pass to func() + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * Appears to be racy against CPU hotplug. + * + * schedule_on_each_cpu() is very slow. + */ +int schedule_on_each_cpu(void (*func)(void *info), void *info) { int cpu; - struct work_struct *work; + struct work_struct *works; - work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); - - if (!work) + works = alloc_percpu(struct work_struct); + if (!works) return -ENOMEM; + for_each_online_cpu(cpu) { - INIT_WORK(work + cpu, func, info); + INIT_WORK(per_cpu_ptr(works, cpu), func, info); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), - work + cpu); + per_cpu_ptr(works, cpu)); } flush_workqueue(keventd_wq); - kfree(work); + free_percpu(works); return 0; } @@ -531,11 +543,11 @@ int current_is_keventd(void) static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - LIST_HEAD(list); + struct list_head list; struct work_struct *work; spin_lock_irq(&cwq->lock); - list_splice_init(&cwq->worklist, &list); + list_replace_init(&cwq->worklist, &list); while (!list_empty(&list)) { printk("Taking work for %s\n", wq->name); @@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { + if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) + continue; /* Unbind so it can run. */ kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, any_online_cpu(cpu_online_map)); |