diff options
author | Ingo Molnar <mingo@kernel.org> | 2014-08-24 22:32:24 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2014-08-24 22:32:24 +0200 |
commit | 83bc90e11576f9c100f8ef4ba2bcd0b89212e3fb (patch) | |
tree | e59186b4d315c80255851e0d204143ecc21399a0 /kernel | |
parent | Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/... (diff) | |
parent | Merge tag 'pwm/for-3.17-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff) | |
download | linux-83bc90e11576f9c100f8ef4ba2bcd0b89212e3fb.tar.xz linux-83bc90e11576f9c100f8ef4ba2bcd0b89212e3fb.zip |
Merge branch 'linus' into perf/core, to fix conflicts
Conflicts:
arch/x86/kernel/cpu/perf_event_intel_uncore*.c
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
67 files changed, 4265 insertions, 1717 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f2a8b6246ce9..dc5c77544fd6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,12 +3,11 @@ # obj-y = fork.o exec_domain.o panic.o \ - cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ + cpu.o exit.o softirq.o resource.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o nsproxy.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o smpboot.o @@ -87,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o +obj-$(CONFIG_NET) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ @@ -105,27 +105,11 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - ############################################################################### # # Roll all the X.509 certificates that we can find together and pull them into diff --git a/kernel/acct.c b/kernel/acct.c index 808a86ff229d..b4c667d22e79 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,6 +59,7 @@ #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ #include <linux/pid_namespace.h> +#include <linux/fs_pin.h> /* * These constants control the amount of freespace that suspend and @@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *); +static void do_acct_process(struct bsd_acct_struct *acct); -/* - * This structure is used so that all the data protected by lock - * can be placed in the same cache line as the lock. This primes - * the cache line to have the data after getting the lock. - */ struct bsd_acct_struct { + struct fs_pin pin; + struct mutex lock; int active; unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct list_head list; + struct work_struct work; + struct completion done; }; -static DEFINE_SPINLOCK(acct_lock); -static LIST_HEAD(acct_list); - /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct bsd_acct_struct *acct, struct file *file) +static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - int res; - int act; - u64 resume; - u64 suspend; - - spin_lock(&acct_lock); - res = acct->active; - if (!file || time_is_before_jiffies(acct->needcheck)) + + if (time_is_before_jiffies(acct->needcheck)) goto out; - spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(&file->f_path, &sbuf)) - return res; - suspend = sbuf.f_blocks * SUSPEND; - resume = sbuf.f_blocks * RESUME; - - do_div(suspend, 100); - do_div(resume, 100); - - if (sbuf.f_bavail <= suspend) - act = -1; - else if (sbuf.f_bavail >= resume) - act = 1; - else - act = 0; - - /* - * If some joker switched acct->file under us we'ld better be - * silent and _not_ touch anything. - */ - spin_lock(&acct_lock); - if (file != acct->file) { - if (act) - res = act > 0; + if (vfs_statfs(&acct->file->f_path, &sbuf)) goto out; - } if (acct->active) { - if (act < 0) { + u64 suspend = sbuf.f_blocks * SUSPEND; + do_div(suspend, 100); + if (sbuf.f_bavail <= suspend) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { - if (act > 0) { + u64 resume = sbuf.f_blocks * RESUME; + do_div(resume, 100); + if (sbuf.f_bavail >= resume) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - res = acct->active; out: - spin_unlock(&acct_lock); + return acct->active; +} + +static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) +{ + struct bsd_acct_struct *res; +again: + smp_rmb(); + rcu_read_lock(); + res = ACCESS_ONCE(ns->bacct); + if (!res) { + rcu_read_unlock(); + return NULL; + } + if (!atomic_long_inc_not_zero(&res->pin.count)) { + rcu_read_unlock(); + cpu_relax(); + goto again; + } + rcu_read_unlock(); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + pin_put(&res->pin); + goto again; + } return res; } -/* - * Close the old accounting file (if currently open) and then replace - * it with file (if non-NULL). - * - * NOTE: acct_lock MUST be held on entry and exit. - */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, - struct pid_namespace *ns) +static void close_work(struct work_struct *work) +{ + struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); + struct file *file = acct->file; + if (file->f_op->flush) + file->f_op->flush(file, NULL); + __fput_sync(file); + complete(&acct->done); +} + +static void acct_kill(struct bsd_acct_struct *acct, + struct bsd_acct_struct *new) { - struct file *old_acct = NULL; - struct pid_namespace *old_ns = NULL; - - if (acct->file) { - old_acct = acct->file; - old_ns = acct->ns; - acct->active = 0; - acct->file = NULL; + if (acct) { + struct pid_namespace *ns = acct->ns; + do_acct_process(acct); + INIT_WORK(&acct->work, close_work); + init_completion(&acct->done); + schedule_work(&acct->work); + wait_for_completion(&acct->done); + pin_remove(&acct->pin); + ns->bacct = new; acct->ns = NULL; - list_del(&acct->list); + atomic_long_dec(&acct->pin.count); + mutex_unlock(&acct->lock); + pin_put(&acct->pin); } - if (file) { - acct->file = file; - acct->ns = ns; - acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - acct->active = 1; - list_add(&acct->list, &acct_list); - } - if (old_acct) { - mnt_unpin(old_acct->f_path.mnt); - spin_unlock(&acct_lock); - do_acct_process(acct, old_ns, old_acct); - filp_close(old_acct, NULL); - spin_lock(&acct_lock); +} + +static void acct_pin_kill(struct fs_pin *pin) +{ + struct bsd_acct_struct *acct; + acct = container_of(pin, struct bsd_acct_struct, pin); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + pin_put(pin); + acct = NULL; } + acct_kill(acct, NULL); } static int acct_on(struct filename *pathname) { struct file *file; - struct vfsmount *mnt; - struct pid_namespace *ns; - struct bsd_acct_struct *acct = NULL; + struct vfsmount *mnt, *internal; + struct pid_namespace *ns = task_active_pid_ns(current); + struct bsd_acct_struct *acct, *old; + int err; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; /* Difference from BSD - they don't do O_APPEND */ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) + if (IS_ERR(file)) { + kfree(acct); return PTR_ERR(file); + } if (!S_ISREG(file_inode(file)->i_mode)) { + kfree(acct); filp_close(file, NULL); return -EACCES; } if (!file->f_op->write) { + kfree(acct); filp_close(file, NULL); return -EIO; } - - ns = task_active_pid_ns(current); - if (ns->bacct == NULL) { - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (acct == NULL) { - filp_close(file, NULL); - return -ENOMEM; - } + internal = mnt_clone_internal(&file->f_path); + if (IS_ERR(internal)) { + kfree(acct); + filp_close(file, NULL); + return PTR_ERR(internal); } - - spin_lock(&acct_lock); - if (ns->bacct == NULL) { - ns->bacct = acct; - acct = NULL; + err = mnt_want_write(internal); + if (err) { + mntput(internal); + kfree(acct); + filp_close(file, NULL); + return err; } - mnt = file->f_path.mnt; - mnt_pin(mnt); - acct_file_reopen(ns->bacct, file, ns); - spin_unlock(&acct_lock); - - mntput(mnt); /* it's pinned, now give up active reference */ - kfree(acct); - + file->f_path.mnt = internal; + + atomic_long_set(&acct->pin.count, 1); + acct->pin.kill = acct_pin_kill; + acct->file = file; + acct->needcheck = jiffies; + acct->ns = ns; + mutex_init(&acct->lock); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ + pin_insert(&acct->pin, mnt); + + old = acct_get(ns); + if (old) + acct_kill(old, acct); + else + ns->bacct = acct; + mutex_unlock(&acct->lock); + mnt_drop_write(mnt); + mntput(mnt); return 0; } +static DEFINE_MUTEX(acct_on_mutex); + /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting @@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); + mutex_lock(&acct_on_mutex); error = acct_on(tmp); + mutex_unlock(&acct_on_mutex); putname(tmp); } else { - struct bsd_acct_struct *acct; - - acct = task_active_pid_ns(current)->bacct; - if (acct == NULL) - return 0; - - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); + acct_kill(acct_get(task_active_pid_ns(current)), NULL); } return error; } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. - */ -void acct_auto_close_mnt(struct vfsmount *m) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. - */ -void acct_auto_close(struct super_block *sb) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.dentry->d_sb == sb) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct = ns->bacct; - - if (acct == NULL) - return; - - spin_lock(&acct_lock); - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - - kfree(acct); + acct_kill(acct_get(ns), NULL); } /* @@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ +#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ static comp2_t encode_comp2_t(u64 value) { @@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value) } #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@ -429,8 +391,9 @@ static u32 encode_float(u64 value) unsigned exp = 190; unsigned u; - if (value==0) return 0; - while ((s64)value > 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -448,120 +411,112 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void fill_ac(acct_t *ac) { struct pacct_struct *pacct = ¤t->signal->pacct; - acct_t ac; - mm_segment_t fs; - unsigned long flim; - u64 elapsed; - u64 run_time; - struct timespec uptime; + u64 elapsed, run_time; struct tty_struct *tty; - const struct cred *orig_cred; - - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct, file)) - goto out; /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset(&ac, 0, sizeof(acct_t)); + memset(ac, 0, sizeof(acct_t)); - ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); + ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; + strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); /* calculate run_time in nsec*/ - do_posix_clock_monotonic_gettime(&uptime); - run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC - + current->group_leader->start_time.tv_nsec; + run_time = ktime_get_ns(); + run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 - ac.ac_etime = encode_float(elapsed); +#if ACCT_VERSION == 3 + ac->ac_etime = encode_float(elapsed); #else - ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); - ac.ac_etime_hi = etime >> 16; - ac.ac_etime_lo = (u16) etime; + + ac->ac_etime_hi = etime >> 16; + ac->ac_etime_lo = (u16) etime; } #endif do_div(elapsed, AHZ); - ac.ac_btime = get_seconds() - elapsed; + ac->ac_btime = get_seconds() - elapsed; +#if ACCT_VERSION==2 + ac->ac_ahz = AHZ; +#endif + + spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ + ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; + ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac->ac_flag = pacct->ac_flag; + ac->ac_mem = encode_comp_t(pacct->ac_mem); + ac->ac_minflt = encode_comp_t(pacct->ac_minflt); + ac->ac_majflt = encode_comp_t(pacct->ac_majflt); + ac->ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(¤t->sighand->siglock); +} +/* + * do_acct_process does all actual work. Caller holds the reference to file. + */ +static void do_acct_process(struct bsd_acct_struct *acct) +{ + acct_t ac; + unsigned long flim; + const struct cred *orig_cred; + struct pid_namespace *ns = acct->ns; + struct file *file = acct->file; + + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); + + /* + * First check to see if there is enough free_space to continue + * the process accounting system. + */ + if (!check_free_space(acct)) + goto out; + + fill_ac(&ac); /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 - ac.ac_ahz = AHZ; -#endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); #endif - - spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; /* Safe as we hold the siglock */ - ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); - ac.ac_flag = pacct->ac_flag; - ac.ac_mem = encode_comp_t(pacct->ac_mem); - ac.ac_minflt = encode_comp_t(pacct->ac_minflt); - ac.ac_majflt = encode_comp_t(pacct->ac_majflt); - ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock_irq(¤t->sighand->siglock); - ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ - ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_swaps = encode_comp_t(0); - /* * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise. */ - if (!file_start_write_trylock(file)) - goto out; - /* - * Kernel segment override to datasegment and write it - * to the accounting file. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - file->f_op->write(file, (char *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); - file_end_write(file); + if (file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + file_end_write(file); + } out: + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; revert_creds(orig_cred); } @@ -578,6 +533,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { @@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead) spin_unlock_irq(¤t->sighand->siglock); } -static void acct_process_in_ns(struct pid_namespace *ns) +static void slow_acct_process(struct pid_namespace *ns) { - struct file *file = NULL; - struct bsd_acct_struct *acct; - - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - return; - - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { - spin_unlock(&acct_lock); - return; + for ( ; ns; ns = ns->parent) { + struct bsd_acct_struct *acct = acct_get(ns); + if (acct) { + do_acct_process(acct); + mutex_unlock(&acct->lock); + pin_put(&acct->pin); + } } - get_file(file); - spin_unlock(&acct_lock); - - do_acct_process(acct, ns, file); - fput(file); } /** - * acct_process - now just a wrapper around acct_process_in_ns, - * which in turn is a wrapper around do_acct_process. + * acct_process * * handles process accounting for an exiting task */ @@ -649,6 +591,10 @@ void acct_process(void) * alive and holds its namespace, which in turn holds * its parent. */ - for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) - acct_process_in_ns(ns); + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { + if (ns->bacct) + break; + } + if (unlikely(ns)) + slow_acct_process(ns); } diff --git a/kernel/audit.c b/kernel/audit.c index 3ef2e0e797e8..ba2ff5a5c600 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) audit_log_format(ab, " %s=", prefix); CAP_FOR_EACH_U32(i) { audit_log_format(ab, "%08x", - cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + cap->cap[CAP_LAST_U32 - i]); } } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c3dbb7..c447cd9848d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246b04b8..e1d1d1952bfa 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <linux/mmzone.h> #include <linux/kbuild.h> -#include <linux/page_cgroup.h> #include <linux/log2.h> #include <linux/spinlock_types.h> @@ -18,7 +17,6 @@ void foo(void) /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile new file mode 100644 index 000000000000..6a71145e2769 --- /dev/null +++ b/kernel/bpf/Makefile @@ -0,0 +1 @@ +obj-y := core.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c new file mode 100644 index 000000000000..7f0dbcbb34af --- /dev/null +++ b/kernel/bpf/core.c @@ -0,0 +1,534 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: + * + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist <jschlst@samba.org> + * Alexei Starovoitov <ast@plumgrid.com> + * Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in bpf_check_classic() + */ +#include <linux/filter.h> +#include <linux/skbuff.h> +#include <asm/unaligned.h> + +/* Registers */ +#define BPF_R0 regs[BPF_REG_0] +#define BPF_R1 regs[BPF_REG_1] +#define BPF_R2 regs[BPF_REG_2] +#define BPF_R3 regs[BPF_REG_3] +#define BPF_R4 regs[BPF_REG_4] +#define BPF_R5 regs[BPF_REG_5] +#define BPF_R6 regs[BPF_REG_6] +#define BPF_R7 regs[BPF_REG_7] +#define BPF_R8 regs[BPF_REG_8] +#define BPF_R9 regs[BPF_REG_9] +#define BPF_R10 regs[BPF_REG_10] + +/* Named registers */ +#define DST regs[insn->dst_reg] +#define SRC regs[insn->src_reg] +#define FP regs[BPF_REG_FP] +#define ARG1 regs[BPF_REG_ARG1] +#define CTX regs[BPF_REG_CTX] +#define IMM insn->imm + +/* No hurry in this branch + * + * Exported for the bpf jit load helper. + */ +void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb_network_header(skb) + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) + return ptr; + + return NULL; +} + +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + +/** + * __bpf_prog_run - run eBPF program on a given context + * @ctx: is the data we are operating on + * @insn: is the array of eBPF instructions + * + * Decode and execute eBPF instructions. + */ +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +{ + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG], tmp; + static const void *jumptable[256] = { + [0 ... 255] = &&default_label, + /* Now overwrite non-defaults ... */ + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, + [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, + [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, + [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, + [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, + [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, + [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, + [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, + [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, + [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, + [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, + [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, + [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, + [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, + [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, + [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, + [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, + [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, + [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, + [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, + [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, + [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, + [BPF_ALU | BPF_NEG] = &&ALU_NEG, + [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, + [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, + /* 64 bit ALU operations */ + [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, + [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, + [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, + [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, + [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, + [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, + [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, + [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, + [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, + [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, + [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, + [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, + [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, + [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, + [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, + [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, + [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, + [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, + [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, + [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, + [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, + [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, + [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, + [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, + [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, + /* Call instruction */ + [BPF_JMP | BPF_CALL] = &&JMP_CALL, + /* Jumps */ + [BPF_JMP | BPF_JA] = &&JMP_JA, + [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, + [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, + [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, + [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, + [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, + [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, + [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, + [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, + [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, + [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, + /* Program return */ + [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, + /* Store instructions */ + [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, + [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, + [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, + [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, + [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, + [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, + [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, + [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, + [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, + [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, + [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, + [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, + [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, + [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, + [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + }; + void *ptr; + int off; + +#define CONT ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + + /* Registers used in classic BPF programs need to be reset first. */ + regs[BPF_REG_A] = 0; + regs[BPF_REG_X] = 0; + +select_insn: + goto *jumptable[insn->code]; + + /* ALU */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + + ALU(ADD, +) + ALU(SUB, -) + ALU(AND, &) + ALU(OR, |) + ALU(LSH, <<) + ALU(RSH, >>) + ALU(XOR, ^) + ALU(MUL, *) +#undef ALU + ALU_NEG: + DST = (u32) -DST; + CONT; + ALU64_NEG: + DST = -DST; + CONT; + ALU_MOV_X: + DST = (u32) SRC; + CONT; + ALU_MOV_K: + DST = (u32) IMM; + CONT; + ALU64_MOV_X: + DST = SRC; + CONT; + ALU64_MOV_K: + DST = IMM; + CONT; + ALU64_ARSH_X: + (*(s64 *) &DST) >>= SRC; + CONT; + ALU64_ARSH_K: + (*(s64 *) &DST) >>= IMM; + CONT; + ALU64_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = DST; + DST = do_div(tmp, SRC); + CONT; + ALU_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); + CONT; + ALU64_MOD_K: + tmp = DST; + DST = do_div(tmp, IMM); + CONT; + ALU_MOD_K: + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: + if (unlikely(SRC == 0)) + return 0; + do_div(DST, SRC); + CONT; + ALU_DIV_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; + CONT; + ALU64_DIV_K: + do_div(DST, IMM); + CONT; + ALU_DIV_K: + tmp = (u32) DST; + do_div(tmp, (u32) IMM); + DST = (u32) tmp; + CONT; + ALU_END_TO_BE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_be16(DST); + break; + case 32: + DST = (__force u32) cpu_to_be32(DST); + break; + case 64: + DST = (__force u64) cpu_to_be64(DST); + break; + } + CONT; + ALU_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_le16(DST); + break; + case 32: + DST = (__force u32) cpu_to_le32(DST); + break; + case 64: + DST = (__force u64) cpu_to_le64(DST); + break; + } + CONT; + + /* CALL */ + JMP_CALL: + /* Function call scratches BPF_R1-BPF_R5 registers, + * preserves BPF_R6-BPF_R9, and stores return value + * into BPF_R0. + */ + BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5); + CONT; + + /* JMP */ + JMP_JA: + insn += insn->off; + CONT; + JMP_JEQ_X: + if (DST == SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JEQ_K: + if (DST == IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_X: + if (DST != SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_K: + if (DST != IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_X: + if (DST > SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_K: + if (DST > IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_X: + if (DST >= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_K: + if (DST >= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_X: + if (((s64) DST) > ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_K: + if (((s64) DST) > ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_X: + if (((s64) DST) >= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_K: + if (((s64) DST) >= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_X: + if (DST & SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_K: + if (DST & IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_EXIT: + return BPF_R0; + + /* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ + CONT; \ + ST_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; + + LDST(B, u8) + LDST(H, u16) + LDST(W, u32) + LDST(DW, u64) +#undef LDST + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + CONT; + STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + CONT; + LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ + off = IMM; +load_word: + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are + * only appearing in the programs where ctx == + * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] + * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, + * internal BPF verifier will check that BPF_R6 == + * ctx. + * + * BPF_ABS and BPF_IND are wrappers of function calls, + * so they scratch BPF_R1-BPF_R5 registers, preserve + * BPF_R6-BPF_R9, and store return value into BPF_R0. + * + * Implicit input: + * ctx == skb == BPF_R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness + */ + + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be32(ptr); + CONT; + } + + return 0; + LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ + off = IMM; +load_half: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be16(ptr); + CONT; + } + + return 0; + LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ + off = IMM; +load_byte: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = *(u8 *)ptr; + CONT; + } + + return 0; + LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_word; + LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_half; + LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ + off = IMM + SRC; + goto load_byte; + + default_label: + /* If we ever reach this, we have a bug somewhere. */ + WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + return 0; +} + +void __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ +} + +/** + * bpf_prog_select_runtime - select execution runtime for BPF program + * @fp: bpf_prog populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via BPF_PROG_RUN() macro + */ +void bpf_prog_select_runtime(struct bpf_prog *fp) +{ + fp->bpf_func = (void *) __bpf_prog_run; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); + +/* free internal BPF program */ +void bpf_prog_free(struct bpf_prog *fp) +{ + bpf_jit_free(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/capability.c b/kernel/capability.c index a5cf13c018ce..989f5bfc57dc 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) i++; } + effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + new = prepare_creds(); if (!new) return -ENOMEM; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2f7c760305ca..379650b984f8 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) static void kdb_sysinfo(struct sysinfo *val) { struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); memset(val, 0, sizeof(*val)); val->uptime = uptime.tv_sec; val->loads[0] = avenrun[0]; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 54996b71e66d..ef90b04d783f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk) } /* - * Start accounting for a delay statistic using - * its starting timestamp (@start) + * Finish delay accounting for a statistic using its timestamps (@start), + * accumalator (@total) and @count */ - -static inline void delayacct_start(struct timespec *start) +static void delayacct_end(u64 *start, u64 *total, u32 *count) { - do_posix_clock_monotonic_gettime(start); -} - -/* - * Finish delay accounting for a statistic using - * its timestamps (@start, @end), accumalator (@total) and @count - */ - -static void delayacct_end(struct timespec *start, struct timespec *end, - u64 *total, u32 *count) -{ - struct timespec ts; - s64 ns; + s64 ns = ktime_get_ns() - *start; unsigned long flags; - do_posix_clock_monotonic_gettime(end); - ts = timespec_sub(*end, *start); - ns = timespec_to_ns(&ts); - if (ns < 0) - return; - - spin_lock_irqsave(¤t->delays->lock, flags); - *total += ns; - (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + if (ns > 0) { + spin_lock_irqsave(¤t->delays->lock, flags); + *total += ns; + (*count)++; + spin_unlock_irqrestore(¤t->delays->lock, flags); + } } void __delayacct_blkio_start(void) { - delayacct_start(¤t->delays->blkio_start); + current->delays->blkio_start = ktime_get_ns(); } void __delayacct_blkio_end(void) @@ -89,35 +72,29 @@ void __delayacct_blkio_end(void) if (current->delays->flags & DELAYACCT_PF_SWAPIN) /* Swapin block I/O */ delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, ¤t->delays->swapin_delay, ¤t->delays->swapin_count); else /* Other block I/O */ delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, ¤t->delays->blkio_delay, ¤t->delays->blkio_count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { - s64 tmp; - unsigned long t1; - unsigned long long t2, t3; - unsigned long flags; - struct timespec ts; cputime_t utime, stime, stimescaled, utimescaled; + unsigned long long t2, t3; + unsigned long flags, t1; + s64 tmp; - tmp = (s64)d->cpu_run_real_total; task_cputime(tsk, &utime, &stime); - cputime_to_timespec(utime + stime, &ts); - tmp += timespec_to_ns(&ts); + tmp = (s64)d->cpu_run_real_total; + tmp += cputime_to_nsecs(utime + stime); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; - tmp = (s64)d->cpu_scaled_run_real_total; task_cputime_scaled(tsk, &utimescaled, &stimescaled); - cputime_to_timespec(utimescaled + stimescaled, &ts); - tmp += timespec_to_ns(&ts); + tmp = (s64)d->cpu_scaled_run_real_total; + tmp += cputime_to_nsecs(utimescaled + stimescaled); d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; @@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) void __delayacct_freepages_start(void) { - delayacct_start(¤t->delays->freepages_start); + current->delays->freepages_start = ktime_get_ns(); } void __delayacct_freepages_end(void) { delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_end, ¤t->delays->freepages_delay, ¤t->delays->freepages_count); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e8c137..1d0af8a2c646 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -315,18 +323,11 @@ retry: if (!new_page) goto put_old; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - -put_new: page_cache_release(new_page); put_old: put_page(old_page); diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..32c58f7433a3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +192,8 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -405,7 +407,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk) core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + clear_thread_flag(TIF_MEMDIE); } /* @@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father) list_for_each_entry_safe(p, n, &father->children, sibling) { struct task_struct *t = p; + do { t->real_parent = reaper; if (t->parent == father) { @@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* * This does two things: * - * A. Make init inherit all the child processes + * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) @@ -648,9 +653,8 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -691,8 +695,7 @@ void do_exit(long code) * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state @@ -716,9 +719,9 @@ void do_exit(long code) raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ @@ -836,7 +839,6 @@ void do_exit(long code) for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code) do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -869,6 +870,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ @@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); @@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 962885edbe53..0cf9cdb6e491 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto free_ti; tsk->stack = ti; +#ifdef CONFIG_SECCOMP + /* + * We must handle setting up seccomp filters once we're under + * the sighand lock in case orig has changed between now and + * then. Until then, filter must be NULL to avoid messing up + * the usage counts on the error path calling free_task. + */ + tsk->seccomp.filter = NULL; +#endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); @@ -365,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -421,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) @@ -527,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -549,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + return mm; + +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -587,7 +619,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -819,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); @@ -851,15 +875,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@ -1081,6 +1096,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } +static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + /* + * Must be called with sighand->lock held, which is common to + * all threads in the group. Holding cred_guard_mutex is not + * needed because this new task is not yet running and cannot + * be racing exec. + */ + assert_spin_locked(¤t->sighand->siglock); + + /* Ref-count the new filter user, and assign it. */ + get_seccomp_filter(current); + p->seccomp = current->seccomp; + + /* + * Explicitly enable no_new_privs here in case it got set + * between the task_struct being duplicated and holding the + * sighand lock. The seccomp state and nnp must be in sync. + */ + if (task_no_new_privs(current)) + task_set_no_new_privs(p); + + /* + * If the parent gained a seccomp mode after copying thread + * flags and between before we held the sighand lock, we have + * to manually enable the seccomp thread flag here. + */ + if (p->seccomp.mode != SECCOMP_MODE_DISABLED) + set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} + SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1098,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ @@ -1195,7 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); - get_seccomp_filter(p); rt_mutex_init_task(p); @@ -1261,9 +1301,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - do_posix_clock_monotonic_gettime(&p->start_time); - p->real_start_time = p->start_time; - monotonic_to_bootbased(&p->real_start_time); + p->start_time = ktime_get_ns(); + p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) @@ -1306,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -1327,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1436,6 +1472,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock(¤t->sighand->siglock); /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to @@ -1872,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a76379..edf67c493a8e 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) err_remove: pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); + debugfs_remove(root_node.dentry); return rc; } diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 452d6f2ba21d..cf80e7b0ddab 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain */ -static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw_irq) +int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) { struct irq_data *data = irq_get_irq_data(virq); struct irq_domain_chip_generic *dgc = d->gc; @@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } +EXPORT_SYMBOL_GPL(irq_map_generic_chip); struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb5e10e32e05..6534ff6ce02e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) +void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37dac3a..ae5167087845 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) - return sprintf(buffer, "0x%lx", address); + return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c925884..0b49a0a58102 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> @@ -40,6 +42,9 @@ #include <asm/io.h> #include <asm/sections.h> +#include <crypto/hash.h> +#include <crypto/sha.h> + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. The caller is @@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + ret = copy_user_segment_list(image, nr_segments, segments); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + + /* Enable the special crash kernel control page allocation policy. */ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. */ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: - return result; + return ret; } -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { - int result; - struct kimage *image; - unsigned long i; + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; - image = NULL; - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; goto out; } - /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; goto out; + } - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; + if (bytes == 0) + break; + pos += bytes; } + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. */ - result = -ENOMEM; + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: kfree(image); -out: - return result; + return ret; } static int kimage_is_destination_range(struct kimage *image, @@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; @@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, /* Loading another kernel to reboot into */ if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); } if (result) @@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective archicture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/module.c b/kernel/module.c index ae79ce615cb9..03214bd288e9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); + + /* we can't deallocate the module until we clear memory protection */ + unset_module_init_ro_nx(mod); + unset_module_core_ro_nx(mod); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); @@ -3381,6 +3386,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size) */ static inline int is_arm_mapping_symbol(const char *str) { + if (str[0] == '.' && str[1] == 'L') + return true; return str[0] == '$' && strchr("atd", str[1]) && (str[2] == '\0' || str[2] == '.'); } @@ -3444,8 +3451,7 @@ const char *module_address_lookup(unsigned long addr, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -3469,8 +3475,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -3495,8 +3500,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -3760,8 +3764,7 @@ struct module *__module_address(unsigned long addr) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_core(addr, mod) - || within_module_init(addr, mod)) + if (within_module(addr, mod)) return mod; } return NULL; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e7811086b82..ef42d0ab3115 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + task_lock(p); ns = p->nsproxy; + p->nsproxy = new; + task_unlock(p); - rcu_assign_pointer(p->nsproxy, new); - - if (ns && atomic_dec_and_test(&ns->count)) { - /* - * wait for others to get what they want from this nsproxy. - * - * cannot release this nsproxy via the call_rcu() since - * put_mnt_ns() will want to sleep - */ - synchronize_rcu(); + if (ns && atomic_dec_and_test(&ns->count)) free_nsproxy(ns); - } } void exit_task_namespaces(struct task_struct *p) diff --git a/kernel/panic.c b/kernel/panic.c index 62e16cef9cc2..d09dc5c32c67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -224,6 +224,7 @@ static const struct tnt tnts[] = { { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, + { TAINT_SOFTLOCKUP, 'L', ' ' }, }; /** diff --git a/kernel/params.c b/kernel/params.c index 1e52ca233fd9..34f527023794 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); STANDARD_PARAM_DEF(long, long, "%li", kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9a83d780facd..e4e4121fa327 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -253,9 +253,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config ARCH_HAS_OPP - bool - config PM_OPP bool ---help--- diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e90f330f139..9a59d042ea84 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (pm_states[i].state) - s += sprintf(s,"%s ", pm_states[i].label); + if (pm_states[i]) + s += sprintf(s,"%s ", pm_states[i]); #endif if (hibernation_available()) @@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_MIN; - struct pm_sleep_state *s; + suspend_state_t state; #endif char *p; int len; @@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (s->state && len == strlen(s->label) - && !strncmp(buf, s->label, len)) - return s->state; + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = pm_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } #endif return PM_SUSPEND_ON; @@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", pm_states[state].state ? - pm_states[state].label : "error"); + return sprintf(buf, "%s\n", pm_states[state] ? + pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); @@ -615,7 +616,6 @@ static struct attribute_group attr_group = { .attrs = g, }; -#ifdef CONFIG_PM_RUNTIME struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); @@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void) return pm_wq ? 0 : -ENOMEM; } -#else -static inline int pm_start_workqueue(void) { return 0; } -#endif static int __init pm_init(void) { diff --git a/kernel/power/power.h b/kernel/power/power.h index c60f13b5270a..5d49dcac2537 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND -struct pm_sleep_state { - const char *label; - suspend_state_t state; -}; - /* kernel/power/suspend.c */ -extern struct pm_sleep_state pm_states[]; +extern const char *pm_states[]; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1ea328aafdc9..c4b8093c80b3 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) * information is stored (in the form of a block of bitmap) * It also contains the pfns that correspond to the start and end of * the represented memory area. + * + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. There is one radix tree for each zone (as returned + * from create_mem_extents). + * + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap. + * + * The struct rtree_node represents one node of the radix tree. */ #define BM_END_OF_MAP (~0UL) #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) +#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) +#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) -struct bm_block { - struct list_head hook; /* hook into a list of bitmap blocks */ - unsigned long start_pfn; /* pfn represented by the first bit */ - unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned long *data; /* bitmap representing pages */ +/* + * struct rtree_node is a wrapper struct to link the nodes + * of the rtree together for easy linear iteration over + * bits and easy freeing + */ +struct rtree_node { + struct list_head list; + unsigned long *data; }; -static inline unsigned long bm_block_bits(struct bm_block *bb) -{ - return bb->end_pfn - bb->start_pfn; -} +/* + * struct mem_zone_bm_rtree represents a bitmap used for one + * populated memory zone. + */ +struct mem_zone_bm_rtree { + struct list_head list; /* Link Zones together */ + struct list_head nodes; /* Radix Tree inner nodes */ + struct list_head leaves; /* Radix Tree leaves */ + unsigned long start_pfn; /* Zone start page frame */ + unsigned long end_pfn; /* Zone end page frame + 1 */ + struct rtree_node *rtree; /* Radix Tree Root */ + int levels; /* Number of Radix Tree Levels */ + unsigned int blocks; /* Number of Bitmap Blocks */ +}; /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct bm_block *block; - int bit; + struct mem_zone_bm_rtree *zone; + struct rtree_node *node; + unsigned long node_pfn; + int node_bit; }; struct memory_bitmap { - struct list_head blocks; /* list of bitmap blocks */ + struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -284,38 +312,178 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static void memory_bm_position_reset(struct memory_bitmap *bm) +#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long)) +#if BITS_PER_LONG == 32 +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2) +#else +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3) +#endif +#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) + +/* + * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. + */ +static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + struct list_head *list) { - bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); - bm->cur.bit = 0; -} + struct rtree_node *node; -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + node = chain_alloc(ca, sizeof(struct rtree_node)); + if (!node) + return NULL; -/** - * create_bm_block_list - create a list of block bitmap objects - * @pages - number of pages to track - * @list - list to put the allocated blocks into - * @ca - chain allocator to be used for allocating memory + node->data = get_image_page(gfp_mask, safe_needed); + if (!node->data) + return NULL; + + list_add_tail(&node->list, list); + + return node; +} + +/* + * add_rtree_block - Add a new leave node to the radix tree + * + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. */ -static int create_bm_block_list(unsigned long pages, - struct list_head *list, - struct chain_allocator *ca) +static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, + int safe_needed, struct chain_allocator *ca) { - unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + struct rtree_node *node, *block, **dst; + unsigned int levels_needed, block_nr; + int i; - while (nr_blocks-- > 0) { - struct bm_block *bb; + block_nr = zone->blocks; + levels_needed = 0; - bb = chain_alloc(ca, sizeof(struct bm_block)); - if (!bb) + /* How many levels do we need for this block nr? */ + while (block_nr) { + levels_needed += 1; + block_nr >>= BM_RTREE_LEVEL_SHIFT; + } + + /* Make sure the rtree has enough levels */ + for (i = zone->levels; i < levels_needed; i++) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) return -ENOMEM; - list_add(&bb->hook, list); + + node->data[0] = (unsigned long)zone->rtree; + zone->rtree = node; + zone->levels += 1; } + /* Allocate new block */ + block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); + if (!block) + return -ENOMEM; + + /* Now walk the rtree to insert the block */ + node = zone->rtree; + dst = &zone->rtree; + block_nr = zone->blocks; + for (i = zone->levels; i > 0; i--) { + int index; + + if (!node) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) + return -ENOMEM; + *dst = node; + } + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + dst = (struct rtree_node **)&((*dst)->data[index]); + node = *dst; + } + + zone->blocks += 1; + *dst = block; + return 0; } +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free); + +/* + * create_zone_bm_rtree - create a radix tree for one zone + * + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. + */ +static struct mem_zone_bm_rtree * +create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + unsigned long start, unsigned long end) +{ + struct mem_zone_bm_rtree *zone; + unsigned int i, nr_blocks; + unsigned long pages; + + pages = end - start; + zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); + if (!zone) + return NULL; + + INIT_LIST_HEAD(&zone->nodes); + INIT_LIST_HEAD(&zone->leaves); + zone->start_pfn = start; + zone->end_pfn = end; + nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + + for (i = 0; i < nr_blocks; i++) { + if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { + free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); + return NULL; + } + } + + return zone; +} + +/* + * free_zone_bm_rtree - Free the memory of the radix tree + * + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. + */ +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + free_image_page(node->data, clear_nosave_free); + + list_for_each_entry(node, &zone->leaves, list) + free_image_page(node->data, clear_nosave_free); +} + +static void memory_bm_position_reset(struct memory_bitmap *bm) +{ + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + list); + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; +} + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + struct mem_extent { struct list_head hook; unsigned long start; @@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) int error; chain_init(&ca, gfp_mask, safe_needed); - INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); error = create_mem_extents(&mem_extents, gfp_mask); if (error) return error; list_for_each_entry(ext, &mem_extents, hook) { - struct bm_block *bb; - unsigned long pfn = ext->start; - unsigned long pages = ext->end - ext->start; + struct mem_zone_bm_rtree *zone; - bb = list_entry(bm->blocks.prev, struct bm_block, hook); - - error = create_bm_block_list(pages, bm->blocks.prev, &ca); - if (error) + zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, + ext->start, ext->end); + if (!zone) { + error = -ENOMEM; goto Error; - - list_for_each_entry_continue(bb, &bm->blocks, hook) { - bb->data = get_image_page(gfp_mask, safe_needed); - if (!bb->data) { - error = -ENOMEM; - goto Error; - } - - bb->start_pfn = pfn; - if (pages >= BM_BITS_PER_BLOCK) { - pfn += BM_BITS_PER_BLOCK; - pages -= BM_BITS_PER_BLOCK; - } else { - /* This is executed only once in the loop */ - pfn += pages; - } - bb->end_pfn = pfn; } + list_add_tail(&zone->list, &bm->zones); } bm->p_list = ca.chain; @@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) */ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct bm_block *bb; + struct mem_zone_bm_rtree *zone; - list_for_each_entry(bb, &bm->blocks, hook) - if (bb->data) - free_image_page(bb->data, clear_nosave_free); + list_for_each_entry(zone, &bm->zones, list) + free_zone_bm_rtree(zone, clear_nosave_free); free_list_of_pages(bm->p_list, clear_nosave_free); - INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); } /** - * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds - * to given pfn. The cur_zone_bm member of @bm and the cur_block member - * of @bm->cur_zone_bm are updated. + * memory_bm_find_bit - Find the bit for pfn in the memory + * bitmap + * + * Find the bit in the bitmap @bm that corresponds to given pfn. + * The cur.zone, cur.block and cur.node_pfn member of @bm are + * updated. + * It walks the radix tree to find the page which contains the bit for + * pfn and returns the bit position in **addr and *bit_nr. */ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) + void **addr, unsigned int *bit_nr) { - struct bm_block *bb; + struct mem_zone_bm_rtree *curr, *zone; + struct rtree_node *node; + int i, block_nr; + + zone = bm->cur.zone; + + if (pfn >= zone->start_pfn && pfn < zone->end_pfn) + goto zone_found; + + zone = NULL; + + /* Find the right zone */ + list_for_each_entry(curr, &bm->zones, list) { + if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { + zone = curr; + break; + } + } + if (!zone) + return -EFAULT; + +zone_found: /* - * Check if the pfn corresponds to the current bitmap block and find - * the block where it fits if this is not the case. + * We have a zone. Now walk the radix tree to find the leave + * node for our pfn. */ - bb = bm->cur.block; - if (pfn < bb->start_pfn) - list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn) - break; - if (pfn >= bb->end_pfn) - list_for_each_entry_continue(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn && pfn < bb->end_pfn) - break; + node = bm->cur.node; + if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + goto node_found; - if (&bb->hook == &bm->blocks) - return -EFAULT; + node = zone->rtree; + block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; + + for (i = zone->levels; i > 0; i--) { + int index; + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + BUG_ON(node->data[index] == 0); + node = (struct rtree_node *)node->data[index]; + } + +node_found: + /* Update last position */ + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + + /* Set return values */ + *addr = node->data; + *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; - /* The block has been found */ - bm->cur.block = bb; - pfn -= bb->start_pfn; - bm->cur.bit = pfn + 1; - *bit_nr = pfn; - *addr = bb->data; return 0; } @@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); + return error; } @@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ + int bit; + + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) return !memory_bm_find_bit(bm, pfn, &addr, &bit); } -/** - * memory_bm_next_pfn - find the pfn that corresponds to the next set bit - * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is - * returned. +/* + * rtree_next_node - Jumps to the next leave node * - * It is required to run memory_bm_position_reset() before the first call to - * this function. + * Sets the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. + * + * Returns true if there is a next node, false otherwise. */ +static bool rtree_next_node(struct memory_bitmap *bm) +{ + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); + if (&bm->cur.node->list != &bm->cur.zone->leaves) { + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; + touch_softlockup_watchdog(); + return true; + } + + /* No more nodes, goto next zone */ + bm->cur.zone = list_entry(bm->cur.zone->list.next, + struct mem_zone_bm_rtree, list); + if (&bm->cur.zone->list != &bm->zones) { + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; + return true; + } + + /* No more zones */ + return false; +} +/** + * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm + * + * Starting from the last returned position this function searches + * for the next set bit in the memory bitmap and returns its + * number. If no more bit is set BM_END_OF_MAP is returned. + * + * It is required to run memory_bm_position_reset() before the + * first call to this function. + */ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct bm_block *bb; + unsigned long bits, pfn, pages; int bit; - bb = bm->cur.block; do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = list_entry(bb->hook.next, struct bm_block, hook); - bm->cur.block = bb; - bm->cur.bit = 0; - } while (&bb->hook != &bm->blocks); + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); + if (bit < bits) { + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; + return pfn; + } + } while (rtree_next_node(bm)); - memory_bm_position_reset(bm); return BM_END_OF_MAP; - - Return_pfn: - bm->cur.bit = bit + 1; - return bb->start_pfn + bit; } /** @@ -731,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } } +static bool is_nosave_page(unsigned long pfn) +{ + struct nosave_region *region; + + list_for_each_entry(region, &nosave_regions, list) { + if (pfn >= region->start_pfn && pfn < region->end_pfn) { + pr_err("PM: %#010llx in e820 nosave region: " + "[mem %#010llx-%#010llx]\n", + (unsigned long long) pfn << PAGE_SHIFT, + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); + return true; + } + } + + return false; +} + /** * create_basic_memory_bitmaps - create bitmaps needed for marking page * frames that should not be saved and free page frames. The pointers @@ -816,12 +1058,17 @@ void free_basic_memory_bitmaps(void) unsigned int snapshot_additional_pages(struct zone *zone) { - unsigned int res; + unsigned int rtree, nodes; - res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), - LINKED_PAGE_DATA_SIZE); - return 2 * res; + rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), + LINKED_PAGE_DATA_SIZE); + while (nodes > 1) { + nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); + rtree += nodes; + } + + return 2 * rtree; } #ifdef CONFIG_HIGHMEM @@ -1094,23 +1341,35 @@ static struct memory_bitmap copy_bm; void swsusp_free(void) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long fb_pfn, fr_pfn; - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } + memory_bm_position_reset(forbidden_pages_map); + memory_bm_position_reset(free_pages_map); + +loop: + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + + /* + * Find the next bit set in both bitmaps. This is guaranteed to + * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. + */ + do { + if (fb_pfn < fr_pfn) + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + if (fr_pfn < fb_pfn) + fr_pfn = memory_bm_next_pfn(free_pages_map); + } while (fb_pfn != fr_pfn); + + if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { + struct page *page = pfn_to_page(fr_pfn); + + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); + __free_page(page); + goto loop; } + nr_copy_pages = 0; nr_meta_pages = 0; restore_pblist = NULL; @@ -1775,7 +2034,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) do { pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) + if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) swsusp_set_page_free(pfn_to_page(pfn)); else return -EFAULT; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4b736b4dfa96..6dadb25cb0d8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,20 +31,11 @@ #include "power.h" -struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, - [PM_SUSPEND_STANDBY] = { .label = "standby", }, - [PM_SUSPEND_MEM] = { .label = "mem", }, -}; +static const char *pm_labels[] = { "mem", "standby", "freeze", }; +const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; - -static bool need_suspend_ops(suspend_state_t state) -{ - return state > PM_SUSPEND_FREEZE; -} - static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); static bool suspend_freeze_wake; @@ -97,10 +88,7 @@ static bool relative_states; static int __init sleep_states_setup(char *str) { relative_states = !strncmp(str, "1", 1); - if (relative_states) { - pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; - pm_states[PM_SUSPEND_FREEZE].state = 0; - } + pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; return 1; } @@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup); void suspend_set_ops(const struct platform_suspend_ops *ops) { suspend_state_t i; - int j = PM_SUSPEND_MAX - 1; + int j = 0; lock_system_sleep(); suspend_ops = ops; for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) - pm_states[j--].state = i; - else if (!relative_states) - pm_states[j--].state = 0; + if (valid_state(i)) { + pm_states[i] = pm_labels[j++]; + } else if (!relative_states) { + pm_states[i] = NULL; + j++; + } - pm_states[j--].state = PM_SUSPEND_FREEZE; - while (j >= PM_SUSPEND_MIN) - pm_states[j--].state = 0; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; unlock_system_sleep(); } @@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state) } EXPORT_SYMBOL_GPL(suspend_valid_only_mem); +static bool sleep_state_supported(suspend_state_t state) +{ + return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); +} + +static int platform_suspend_prepare(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ? + suspend_ops->prepare() : 0; +} + +static int platform_suspend_prepare_late(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? + suspend_ops->prepare_late() : 0; +} + +static void platform_suspend_wake(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) + suspend_ops->wake(); +} + +static void platform_suspend_finish(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) + suspend_ops->finish(); +} + +static int platform_suspend_begin(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) + return freeze_ops->begin(); + else if (suspend_ops->begin) + return suspend_ops->begin(state); + else + return 0; +} + +static void platform_suspend_end(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + freeze_ops->end(); + else if (suspend_ops->end) + suspend_ops->end(); +} + +static void platform_suspend_recover(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) + suspend_ops->recover(); +} + +static bool platform_suspend_again(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? + suspend_ops->suspend_again() : false; +} + static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG @@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state) { int error; - if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) + if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); @@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (need_suspend_ops(state) && suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Platform_finish; - } + error = platform_suspend_prepare(state); + if (error) + goto Platform_finish; error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; } - - if (need_suspend_ops(state) && suspend_ops->prepare_late) { - error = suspend_ops->prepare_late(); - if (error) - goto Platform_wake; - } + error = platform_suspend_prepare_late(state); + if (error) + goto Platform_wake; if (suspend_test(TEST_PLATFORM)) goto Platform_wake; @@ -276,15 +318,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) enable_nonboot_cpus(); Platform_wake: - if (need_suspend_ops(state) && suspend_ops->wake) - suspend_ops->wake(); - + platform_suspend_wake(state); dpm_resume_start(PMSG_RESUME); Platform_finish: - if (need_suspend_ops(state) && suspend_ops->finish) - suspend_ops->finish(); - + platform_suspend_finish(state); return error; } @@ -297,18 +335,13 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (need_suspend_ops(state) && !suspend_ops) + if (!sleep_state_supported(state)) return -ENOSYS; - if (need_suspend_ops(state) && suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { - error = freeze_ops->begin(); - if (error) - goto Close; - } + error = platform_suspend_begin(state); + if (error) + goto Close; + suspend_console(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); @@ -322,25 +355,20 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup && need_suspend_ops(state) - && suspend_ops->suspend_again && suspend_ops->suspend_again()); + } while (!error && !wakeup && platform_suspend_again(state)); Resume_devices: suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); resume_console(); - Close: - if (need_suspend_ops(state) && suspend_ops->end) - suspend_ops->end(); - else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) - freeze_ops->end(); + Close: + platform_suspend_end(state); return error; Recover_platform: - if (need_suspend_ops(state) && suspend_ops->recover) - suspend_ops->recover(); + platform_suspend_recover(state); goto Resume_devices; } @@ -393,7 +421,7 @@ static int enter_state(suspend_state_t state) printk("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); error = suspend_prepare(state); if (error) goto Unlock; @@ -402,7 +430,7 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pr_debug("PM: Entering %s sleep\n", pm_states[state].label); + pr_debug("PM: Entering %s sleep\n", pm_states[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 269b097e78ea..2f524928b6aa 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state].label); + printk(info_test, pm_states[state]); status = pm_suspend(state); if (status == -ENODEV) state = PM_SUSPEND_STANDBY; } if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state].label); + printk(info_test, pm_states[state]); status = pm_suspend(state); } if (status < 0) @@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value) /* "=mem" ==> "mem" */ value++; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (!strcmp(pm_states[i].label, value)) { - test_state = pm_states[i].state; + if (!strcmp(pm_states[i], value)) { + test_state = i; return 0; } @@ -162,8 +162,8 @@ static int __init test_suspend(void) /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) goto done; - if (!pm_states[test_state].state) { - printk(warn_bad_state, pm_states[test_state].label); + if (!pm_states[test_state]) { + printk(warn_bad_state, pm_states[test_state]); goto done; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839dbca07..e04c455a0e38 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/utsname.h> +#include <linux/ctype.h> #include <asm/uaccess.h> @@ -56,7 +57,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; @@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +147,8 @@ static int console_may_schedule; * the overall length of the record. * * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. A length == 0 for the next message @@ -257,7 +258,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -266,10 +267,23 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* Return log buffer address */ +char *log_buf_addr_get(void) +{ + return log_buf; +} + +/* Return log buffer size */ +u32 log_buf_len_get(void) +{ + return log_buf_len; +} + /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { @@ -344,7 +358,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -453,11 +467,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -828,34 +838,74 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid. + */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { @@ -872,7 +922,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -881,7 +931,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { - ignore_loglevel = 1; + ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; @@ -947,11 +997,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) @@ -1310,7 +1356,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; @@ -1416,10 +1462,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1432,8 +1477,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1476,7 +1523,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1608,7 +1655,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1716,21 +1764,30 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + lockdep_off(); + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); + preempt_enable(); + lockdep_on(); } - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -1802,7 +1859,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1881,11 +1938,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -1902,7 +1960,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, ','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1911,7 +1970,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1950,7 +2009,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha i++, c++) if (strcmp(c->name, name) == 0 && c->index == idx) { strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; return i; @@ -1959,12 +2017,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { - console_suspend_enabled = 0; + console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); @@ -2045,8 +2103,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ @@ -2618,14 +2676,13 @@ EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..da14b8d09296 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name) BUG_ON(start >= end); read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + if (first_level_children_only) { + p = iomem_resource.child; + sibling_only = true; + } else + p = &iomem_resource; + + while ((p = next_resource(p, sibling_only))) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name) } /* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. + * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + +/* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. * Now, this function is only for "System RAM". @@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1211575a2208..ec1a286684a5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2393,6 +2393,13 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *this = this_rq(); + *nr_waiters = atomic_read(&this->nr_iowait); + *load = this->cpu_load[0]; +} + #ifdef CONFIG_SMP /* diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9f1608f99819..11e7bc434f43 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -147,8 +147,6 @@ use_default: clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) goto use_default; - trace_cpu_idle_rcuidle(next_state, dev->cpu); - /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take @@ -156,8 +154,6 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); - if (broadcast) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30f9c88..8ecd552fe4f2 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -8,13 +8,6 @@ #include "sched.h" -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - /* * Global load-average calculations * diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 301bbc24739c..44eb005c6695 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -18,15 +18,17 @@ #include <linux/compat.h> #include <linux/sched.h> #include <linux/seccomp.h> +#include <linux/slab.h> +#include <linux/syscalls.h> /* #define SECCOMP_DEBUG 1 */ #ifdef CONFIG_SECCOMP_FILTER #include <asm/syscall.h> #include <linux/filter.h> +#include <linux/pid.h> #include <linux/ptrace.h> #include <linux/security.h> -#include <linux/slab.h> #include <linux/tracehook.h> #include <linux/uaccess.h> @@ -54,7 +56,7 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; - struct sk_filter *prog; + struct bpf_prog *prog; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -87,7 +89,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) * @filter: filter to verify * @flen: length of filter * - * Takes a previously checked filter (by sk_chk_filter) and + * Takes a previously checked filter (by bpf_check_classic) and * redirects all filter code that loads struct sk_buff data * and related data through seccomp_bpf_load. It also * enforces length and alignment checking of those loads. @@ -172,51 +174,184 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(int syscall) { - struct seccomp_filter *f; + struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ - if (WARN_ON(current->seccomp.filter == NULL)) + if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; + /* Make sure cross-thread synced filter points somewhere sane. */ + smp_read_barrier_depends(); + populate_seccomp_data(&sd); /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ - for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); + for (; f; f = f->prev) { + u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } return ret; } +#endif /* CONFIG_SECCOMP_FILTER */ + +static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) +{ + assert_spin_locked(¤t->sighand->siglock); + + if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) + return false; + + return true; +} + +static inline void seccomp_assign_mode(struct task_struct *task, + unsigned long seccomp_mode) +{ + assert_spin_locked(&task->sighand->siglock); + + task->seccomp.mode = seccomp_mode; + /* + * Make sure TIF_SECCOMP cannot be set before the mode (and + * filter) is set. + */ + smp_mb__before_atomic(); + set_tsk_thread_flag(task, TIF_SECCOMP); +} + +#ifdef CONFIG_SECCOMP_FILTER +/* Returns 1 if the parent is an ancestor of the child. */ +static int is_ancestor(struct seccomp_filter *parent, + struct seccomp_filter *child) +{ + /* NULL is the root ancestor. */ + if (parent == NULL) + return 1; + for (; child; child = child->prev) + if (child == parent) + return 1; + return 0; +} /** - * seccomp_attach_filter: Attaches a seccomp filter to current. + * seccomp_can_sync_threads: checks if all threads can be synchronized + * + * Expects sighand and cred_guard_mutex locks to be held. + * + * Returns 0 on success, -ve on error, or the pid of a thread which was + * either not in the correct seccomp mode or it did not have an ancestral + * seccomp filter. + */ +static inline pid_t seccomp_can_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + assert_spin_locked(¤t->sighand->siglock); + + /* Validate all threads being eligible for synchronization. */ + caller = current; + for_each_thread(caller, thread) { + pid_t failed; + + /* Skip current, since it is initiating the sync. */ + if (thread == caller) + continue; + + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || + (thread->seccomp.mode == SECCOMP_MODE_FILTER && + is_ancestor(thread->seccomp.filter, + caller->seccomp.filter))) + continue; + + /* Return the first thread that cannot be synchronized. */ + failed = task_pid_vnr(thread); + /* If the pid cannot be resolved, then return -ESRCH */ + if (unlikely(WARN_ON(failed == 0))) + failed = -ESRCH; + return failed; + } + + return 0; +} + +/** + * seccomp_sync_threads: sets all threads to use current's filter + * + * Expects sighand and cred_guard_mutex locks to be held, and for + * seccomp_can_sync_threads() to have returned success already + * without dropping the locks. + * + */ +static inline void seccomp_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + assert_spin_locked(¤t->sighand->siglock); + + /* Synchronize all threads. */ + caller = current; + for_each_thread(caller, thread) { + /* Skip current, since it needs no changes. */ + if (thread == caller) + continue; + + /* Get a task reference for the new leaf node. */ + get_seccomp_filter(caller); + /* + * Drop the task reference to the shared ancestor since + * current's path will hold a reference. (This also + * allows a put before the assignment.) + */ + put_seccomp_filter(thread); + smp_store_release(&thread->seccomp.filter, + caller->seccomp.filter); + /* + * Opt the other thread into seccomp if needed. + * As threads are considered to be trust-realm + * equivalent (see ptrace_may_access), it is safe to + * allow one thread to transition the other. + */ + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + } + } +} + +/** + * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install * - * Returns 0 on success or an errno on failure. + * Returns filter on success or an ERR_PTR on failure. */ -static long seccomp_attach_filter(struct sock_fprog *fprog) +static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *filter; - unsigned long fp_size = fprog->len * sizeof(struct sock_filter); - unsigned long total_insns = fprog->len; + unsigned long fp_size; struct sock_filter *fp; int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) - return -EINVAL; - - for (filter = current->seccomp.filter; filter; filter = filter->prev) - total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ - if (total_insns > MAX_INSNS_PER_PATH) - return -ENOMEM; + return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); + fp_size = fprog->len * sizeof(struct sock_filter); /* * Installing a seccomp filter requires that the task has @@ -224,14 +359,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. */ - if (!current->no_new_privs && + if (!task_no_new_privs(current) && security_capable_noaudit(current_cred(), current_user_ns(), CAP_SYS_ADMIN) != 0) - return -EACCES; + return ERR_PTR(-EACCES); fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); if (!fp) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* Copy the instructions from fprog. */ ret = -EFAULT; @@ -239,7 +374,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(fp, fprog->len); + ret = bpf_check_classic(fp, fprog->len); if (ret) goto free_prog; @@ -248,8 +383,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (ret) goto free_prog; - /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ - ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + /* Convert 'sock_filter' insns to 'bpf_insn' insns */ + ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); if (ret) goto free_prog; @@ -260,12 +395,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(sk_filter_size(new_len), + filter->prog = kzalloc(bpf_prog_size(new_len), GFP_KERNEL|__GFP_NOWARN); if (!filter->prog) goto free_filter; - ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); + ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; kfree(fp); @@ -273,15 +408,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) atomic_set(&filter->usage, 1); filter->prog->len = new_len; - sk_filter_select_runtime(filter->prog); + bpf_prog_select_runtime(filter->prog); - /* - * If there is an existing filter, make it the prev and don't drop its - * task reference. - */ - filter->prev = current->seccomp.filter; - current->seccomp.filter = filter; - return 0; + return filter; free_filter_prog: kfree(filter->prog); @@ -289,19 +418,20 @@ free_filter: kfree(filter); free_prog: kfree(fp); - return ret; + return ERR_PTR(ret); } /** - * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog * @user_filter: pointer to the user data containing a sock_fprog. * * Returns 0 on success and non-zero otherwise. */ -static long seccomp_attach_user_filter(char __user *user_filter) +static struct seccomp_filter * +seccomp_prepare_user_filter(const char __user *user_filter) { struct sock_fprog fprog; - long ret = -EFAULT; + struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT if (is_compat_task()) { @@ -314,9 +444,56 @@ static long seccomp_attach_user_filter(char __user *user_filter) #endif if (copy_from_user(&fprog, user_filter, sizeof(fprog))) goto out; - ret = seccomp_attach_filter(&fprog); + filter = seccomp_prepare_filter(&fprog); out: - return ret; + return filter; +} + +/** + * seccomp_attach_filter: validate and attach filter + * @flags: flags to change filter behavior + * @filter: seccomp filter to add to the current process + * + * Caller must be holding current->sighand->siglock lock. + * + * Returns 0 on success, -ve on error. + */ +static long seccomp_attach_filter(unsigned int flags, + struct seccomp_filter *filter) +{ + unsigned long total_insns; + struct seccomp_filter *walker; + + assert_spin_locked(¤t->sighand->siglock); + + /* Validate resulting filter length. */ + total_insns = filter->prog->len; + for (walker = current->seccomp.filter; walker; walker = walker->prev) + total_insns += walker->prog->len + 4; /* 4 instr penalty */ + if (total_insns > MAX_INSNS_PER_PATH) + return -ENOMEM; + + /* If thread sync has been requested, check that it is possible. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + int ret; + + ret = seccomp_can_sync_threads(); + if (ret) + return ret; + } + + /* + * If there is an existing filter, make it the prev and don't drop its + * task reference. + */ + filter->prev = current->seccomp.filter; + current->seccomp.filter = filter; + + /* Now that the new filter is in place, synchronize to all threads. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + seccomp_sync_threads(); + + return 0; } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -329,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk) atomic_inc(&orig->usage); } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_free(filter->prog); + kfree(filter); + } +} + /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ void put_seccomp_filter(struct task_struct *tsk) { @@ -337,8 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - sk_filter_free(freeme->prog); - kfree(freeme); + seccomp_filter_free(freeme); } } @@ -382,12 +566,17 @@ static int mode1_syscalls_32[] = { int __secure_computing(int this_syscall) { - int mode = current->seccomp.mode; int exit_sig = 0; int *syscall; u32 ret; - switch (mode) { + /* + * Make sure that any changes to mode from another thread have + * been seen after TIF_SECCOMP was seen. + */ + rmb(); + + switch (current->seccomp.mode) { case SECCOMP_MODE_STRICT: syscall = mode1_syscalls; #ifdef CONFIG_COMPAT @@ -473,47 +662,152 @@ long prctl_get_seccomp(void) } /** - * prctl_set_seccomp: configures current->seccomp.mode - * @seccomp_mode: requested mode to use - * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * seccomp_set_mode_strict: internal function for setting strict seccomp * - * This function may be called repeatedly with a @seccomp_mode of - * SECCOMP_MODE_FILTER to install additional filters. Every filter - * successfully installed will be evaluated (in reverse order) for each system - * call the task makes. + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. + */ +static long seccomp_set_mode_strict(void) +{ + const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; + long ret = -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + + if (!seccomp_may_assign_mode(seccomp_mode)) + goto out; + +#ifdef TIF_NOTSC + disable_TSC(); +#endif + seccomp_assign_mode(current, seccomp_mode); + ret = 0; + +out: + spin_unlock_irq(¤t->sighand->siglock); + + return ret; +} + +#ifdef CONFIG_SECCOMP_FILTER +/** + * seccomp_set_mode_filter: internal function for setting seccomp filter + * @flags: flags to change filter behavior + * @filter: struct sock_fprog containing filter + * + * This function may be called repeatedly to install additional filters. + * Every filter successfully installed will be evaluated (in reverse order) + * for each system call the task makes. * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure. */ -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +static long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) { + const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; + struct seccomp_filter *prepared = NULL; long ret = -EINVAL; - if (current->seccomp.mode && - current->seccomp.mode != seccomp_mode) + /* Validate flags. */ + if (flags & ~SECCOMP_FILTER_FLAG_MASK) + return -EINVAL; + + /* Prepare the new filter before holding any locks. */ + prepared = seccomp_prepare_user_filter(filter); + if (IS_ERR(prepared)) + return PTR_ERR(prepared); + + /* + * Make sure we cannot change seccomp or nnp state via TSYNC + * while another thread is in the middle of calling exec. + */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC && + mutex_lock_killable(¤t->signal->cred_guard_mutex)) + goto out_free; + + spin_lock_irq(¤t->sighand->siglock); + + if (!seccomp_may_assign_mode(seccomp_mode)) + goto out; + + ret = seccomp_attach_filter(flags, prepared); + if (ret) goto out; + /* Do not free the successfully attached filter. */ + prepared = NULL; + + seccomp_assign_mode(current, seccomp_mode); +out: + spin_unlock_irq(¤t->sighand->siglock); + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + mutex_unlock(¤t->signal->cred_guard_mutex); +out_free: + seccomp_filter_free(prepared); + return ret; +} +#else +static inline long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) +{ + return -EINVAL; +} +#endif + +/* Common entry point for both prctl and syscall. */ +static long do_seccomp(unsigned int op, unsigned int flags, + const char __user *uargs) +{ + switch (op) { + case SECCOMP_SET_MODE_STRICT: + if (flags != 0 || uargs != NULL) + return -EINVAL; + return seccomp_set_mode_strict(); + case SECCOMP_SET_MODE_FILTER: + return seccomp_set_mode_filter(flags, uargs); + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, + const char __user *, uargs) +{ + return do_seccomp(op, flags, uargs); +} + +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * Returns 0 on success or -EINVAL on failure. + */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +{ + unsigned int op; + char __user *uargs; switch (seccomp_mode) { case SECCOMP_MODE_STRICT: - ret = 0; -#ifdef TIF_NOTSC - disable_TSC(); -#endif + op = SECCOMP_SET_MODE_STRICT; + /* + * Setting strict mode through prctl always ignored filter, + * so make sure it is always NULL here to pass the internal + * check in do_seccomp(). + */ + uargs = NULL; break; -#ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: - ret = seccomp_attach_user_filter(filter); - if (ret) - goto out; + op = SECCOMP_SET_MODE_FILTER; + uargs = filter; break; -#endif default: - goto out; + return -EINVAL; } - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); -out: - return ret; + /* prctl interface doesn't have flags, so they are always zero. */ + return do_seccomp(op, 0, uargs); } diff --git a/kernel/signal.c b/kernel/signal.c index 40b76e351e64..8f0876f9f6dd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, - struct pt_regs *regs, void *cookie) +int get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2241,13 +2240,13 @@ relock: goto relock; } - signr = dequeue_signal(current, ¤t->blocked, info); + signr = dequeue_signal(current, ¤t->blocked, &ksig->info); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = ptrace_signal(signr, info); + signr = ptrace_signal(signr, &ksig->info); if (!signr) continue; } @@ -2255,13 +2254,13 @@ relock: ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ - trace_signal_deliver(signr, info, ka); + trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ - *return_ka = *ka; + ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; @@ -2311,7 +2310,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(info->si_signo))) { + if (likely(do_signal_stop(ksig->info.si_signo))) { /* It released the siglock. */ goto relock; } @@ -2332,7 +2331,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(info->si_signo); + print_fatal_signal(ksig->info.si_signo); proc_coredump_connector(current); /* * If it was able to dump core, this kills all @@ -2342,34 +2341,32 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info); + do_coredump(&ksig->info); } /* * Death signals, no core dump. */ - do_group_exit(info->si_signo); + do_group_exit(ksig->info.si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); - return signr; + + ksig->sig = signr; + return ksig->sig > 0; } /** * signal_delivered - - * @sig: number of signal being delivered - * @info: siginfo_t of signal being delivered - * @ka: sigaction setting that chose the handler - * @regs: user register state + * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been - * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER - * is set in @ka->sa.sa_flags. Tracing is notified. + * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ -void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, - struct pt_regs *regs, int stepping) +static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; @@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, simply clear the restore sigmask flag. */ clear_restore_sigmask(); - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); + sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); + if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); - tracehook_signal_handler(sig, info, ka, regs, stepping); + tracehook_signal_handler(stepping); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) @@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping) if (failed) force_sigsegv(ksig->sig, current); else - signal_delivered(ksig->sig, &ksig->info, &ksig->ka, - signal_pt_regs(), stepping); + signal_delivered(ksig, stepping); } /* diff --git a/kernel/smp.c b/kernel/smp.c index 487653b5844f..aff8aa14f547 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); - WARN_ON_ONCE(!ret); + WARN_ON_ONCE(ret); } preempt_enable(); } diff --git a/kernel/sys.c b/kernel/sys.c index 66a751ebf9d9..ce8129192a26 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - current->no_new_privs = 1; + task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - return current->no_new_privs ? 1 : 0; + return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 36441b51b5df..391d4ddb6f4b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); @@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); @@ -213,3 +215,6 @@ cond_syscall(compat_sys_open_by_handle_at); /* compare kernel pointers */ cond_syscall(sys_kcmp); + +/* operate on Secure Computing state */ +cond_syscall(sys_seccomp); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e22a72c..75875a741b5e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7ad..e4ba9a5a5ccb 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = { { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, {} }; diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 52ebc70263f4..875f64e8935b 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void) pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", PTR_ERR(key)); } else { + set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); pr_notice("Loaded X.509 cert '%s'\n", key_ref_to_ptr(key)->description); key_ref_put(key); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbfdd83..0dbab6d1acb4 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@ * the GNU General Public License for more details. */ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt + #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/random.h> @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); + pr_err("incorrect value in post_handler\n"); } posth_val = preh_val + div_factor; } @@ -59,8 +60,7 @@ static int test_kprobe(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); + pr_err("register_kprobe returned %d\n", ret); return ret; } @@ -68,14 +68,12 @@ static int test_kprobe(void) unregister_kprobe(&kp); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); + pr_err("incorrect value in post_handler2\n"); } posth_val = preh_val + div_factor; } @@ -120,8 +117,7 @@ static int test_kprobes(void) kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); + pr_err("register_kprobes returned %d\n", ret); return ret; } @@ -130,14 +126,12 @@ static int test_kprobes(void) ret = target(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -146,14 +140,12 @@ static int test_kprobes(void) ret = target2(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); + pr_err("kprobe pre_handler2 not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); + pr_err("kprobe post_handler2 not called\n"); handler_errors++; } @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) { if (value != rand1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); + pr_err("incorrect value in jprobe handler\n"); } jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void) ret = register_jprobe(&jp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); + pr_err("register_jprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } @@ -217,24 +206,21 @@ static int test_jprobes(void) jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); + pr_err("register_jprobes returned %d\n", ret); return ret; } jph_val = 0; ret = target(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } jph_val = 0; ret = target2(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); + pr_err("jprobe handler2 not called\n"); handler_errors++; } unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); + pr_err("incorrect value in kretprobe handler\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void) ret = register_kretprobe(&rp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); + pr_err("incorrect value in kretprobe handler2\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void) rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } krph_val = 0; ret = target(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } krph_val = 0; ret = target2(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler2 not called\n"); + pr_err("kretprobe handler2 not called\n"); handler_errors++; } unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void) rand1 = prandom_u32(); } while (rand1 <= div_factor); - printk(KERN_INFO "Kprobe smoke test started\n"); + pr_info("started\n"); num_tests++; ret = test_kprobe(); if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void) #endif /* CONFIG_KRETPROBES */ if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); + pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); + pr_err("BUG: %d error(s) running handlers\n", handler_errors); else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + pr_info("passed successfully\n"); return 0; } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f448513a45ed..d626dc98e8df 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool +# Clocksources require validation of the clocksource against the last +# cycle update - x86/TSC misfeature +config CLOCKSOURCE_VALIDATE_LAST_CYCLE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool @@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL config GENERIC_TIME_VSYSCALL_OLD bool -# ktime_t scalar 64bit nsec representation -config KTIME_SCALAR - bool - # Old style timekeeping config ARCH_USES_GETTIMEOFFSET bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 57a413fd0ebf..7347426fa68d 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,3 +1,4 @@ +obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o posix-clock.o alarmtimer.o @@ -12,3 +13,21 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o +obj-$(CONFIG_TEST_UDELAY) += udelay_test.o + +$(obj)/time.o: $(obj)/timeconst.h + +quiet_cmd_hzfile = HZFILE $@ + cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE + $(call if_changed,hzfile) + +quiet_cmd_bc = BC $@ + cmd_bc = bc -q $(filter-out FORCE,$^) > $@ + +targets += timeconst.h +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE + $(call if_changed,bc) + diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fe75444ae7ec..4aec4a457431 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void) return ret; } - +EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev, struct class_interface *class_intf) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ba3e502c955a..2e949cc9c9f1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -32,6 +32,7 @@ #include <linux/kthread.h> #include "tick-internal.h" +#include "timekeeping_internal.h" void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs) static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; - cycle_t csnow, wdnow; + cycle_t csnow, wdnow, delta; int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; @@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data) continue; } - wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, - watchdog->mult, watchdog->shift); + delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); + wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, + watchdog->shift); - cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & - cs->mask, cs->mult, cs->shift); + delta = clocksource_delta(csnow, cs->cs_last, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); cs->cs_last = csnow; cs->wd_last = wdnow; diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c index 3ab28993f6e0..1c2fe7de2842 100644 --- a/kernel/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -54,6 +54,8 @@ #include <trace/events/timer.h> +#include "timekeeping.h" + /* * The timer bases: * @@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, mono, boot; - struct timespec xts, tom, slp; - s32 tai_offset; + ktime_t xtim, mono, boot, tai; + ktime_t off_real, off_boot, off_tai; - get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); - tai_offset = timekeeping_get_tai_offset(); + mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); + boot = ktime_add(mono, off_boot); + xtim = ktime_add(mono, off_real); + tai = ktime_add(xtim, off_tai); - xtim = timespec_to_ktime(xts); - mono = ktime_add(xtim, timespec_to_ktime(tom)); - boot = ktime_add(mono, timespec_to_ktime(slp)); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = - ktime_add(xtim, ktime_set(tai_offset, 0)); + base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; } /* @@ -264,60 +263,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * too large for inlining: */ #if BITS_PER_LONG < 64 -# ifndef CONFIG_KTIME_SCALAR -/** - * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * @kt: addend - * @nsec: the scalar nsec value to add - * - * Returns the sum of kt and nsec in ktime_t format - */ -ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - /* Make sure nsec fits into long */ - if (unlikely(nsec > KTIME_SEC_MAX)) - return (ktime_t){ .tv64 = KTIME_MAX }; - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_add(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_add_ns); - -/** - * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable - * @kt: minuend - * @nsec: the scalar nsec value to subtract - * - * Returns the subtraction of @nsec from @kt in ktime_t format - */ -ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_sub(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_sub_ns); -# endif /* !CONFIG_KTIME_SCALAR */ - /* * Divide a ktime value by a nanosecond value */ @@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } +EXPORT_SYMBOL_GPL(ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* @@ -602,6 +548,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * + * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming + * and no expiry check happens. The timer gets enqueued into the rbtree. The + * reprogramming and expiry check is done in the hrtimer_interrupt or in the + * softirq. + * * Called with interrupts disabled and base->cpu_base.lock held */ static int hrtimer_reprogram(struct hrtimer *timer, @@ -662,25 +613,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -/* - * When High resolution timers are active, try to reprogram. Note, that in case - * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry - * check happens. The timer gets enqueued into the rbtree. The reprogramming - * and expiry check is done in the hrtimer_interrupt or in the softirq. - */ -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); -} - static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); + return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); } /* @@ -755,8 +694,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { return 0; } @@ -1013,14 +952,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, leftmost = enqueue_hrtimer(timer, new_base); - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) - && hrtimer_enqueue_reprogram(timer, new_base)) { + if (!leftmost) { + unlock_hrtimer_base(timer, &flags); + return ret; + } + + if (!hrtimer_is_hres_active(timer)) { + /* + * Kick to reschedule the next tick to handle the new timer + * on dynticks target. + */ + wake_up_nohz_cpu(new_base->cpu_base->cpu); + } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && + hrtimer_reprogram(timer, new_base)) { + /* + * Only allow reprogramming if the new base is on this CPU. + * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? + */ if (wakeup) { /* * We need to drop cpu_base->lock to avoid a @@ -1680,6 +1630,7 @@ static void init_hrtimers_cpu(int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); } diff --git a/kernel/itimer.c b/kernel/time/itimer.c index 8d262b467573..8d262b467573 100644 --- a/kernel/itimer.c +++ b/kernel/time/itimer.c diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 33db43a39515..87a346fd6d61 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { - struct timespec now, next; + struct timespec64 now; + struct timespec next; int fail = 1; /* @@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work) return; } - getnstimeofday(&now); + getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = now; + struct timespec adjust = timespec64_to_timespec(now); fail = -ENODEV; if (persistent_clock_is_local) @@ -531,7 +532,7 @@ void ntp_notify_cmos_timer(void) { } /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(struct timex *txc, struct timespec *ts) +static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -554,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) static inline void process_adjtimex_modes(struct timex *txc, - struct timespec *ts, + struct timespec64 *ts, s32 *time_tai) { if (txc->modes & ADJ_STATUS) @@ -640,7 +641,7 @@ int ntp_validate_timex(struct timex *txc) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) { int result; @@ -684,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) /* fill PPS status fields */ pps_fill_timex(txc); - txc->time.tv_sec = ts->tv_sec; + txc->time.tv_sec = (time_t)ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 1950cb4ca2a4..bbd102ad9df7 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,6 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); -extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec *, const struct timespec *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..3b8946416a5f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c index 424c2d4265c9..42b463ad90f2 100644 --- a/kernel/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -49,6 +49,8 @@ #include <linux/export.h> #include <linux/hashtable.h> +#include "timekeeping.h" + /* * Management arrays for POSIX timers. Timers are now kept in static hash table * with 512 entries. diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7ab92b19965a..c19c1d84b6f3 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include <linux/hrtimer.h> #include <linux/tick.h> +#include "timekeeping.h" + extern seqlock_t jiffies_lock; #define CS_NAME_LEN 32 diff --git a/kernel/time.c b/kernel/time/time.c index 7c7964c33ae7..f0294ba14634 100644 --- a/kernel/time.c +++ b/kernel/time/time.c @@ -42,6 +42,7 @@ #include <asm/unistd.h> #include "timeconst.h" +#include "timekeeping.h" /* * The timezone where the local system is located. Used as a default by some @@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); +#if BITS_PER_LONG == 32 +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec64); + +/** + * ns_to_timespec64 - Convert nanoseconds to timespec64 + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec64 representation of the nsec parameter. + */ +struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec64); +#endif /* * When we convert to jiffies then we interpret incoming values * the following way: @@ -694,6 +757,7 @@ unsigned long nsecs_to_jiffies(u64 n) { return (unsigned long)nsecs_to_jiffies64(n); } +EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /* * Add two timespec values and do a safety check for overflow. diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc index 511bdf2cafda..511bdf2cafda 100644 --- a/kernel/timeconst.bc +++ b/kernel/time/timeconst.bc diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 32d8d6aaedb8..fb4a9c2cf8d9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,11 +32,34 @@ #define TK_MIRROR (1 << 1) #define TK_CLOCK_WAS_SET (1 << 2) -static struct timekeeper timekeeper; +/* + * The most important data for readout fits into a single 64 byte + * cache line. + */ +static struct { + seqcount_t seq; + struct timekeeper timekeeper; +} tk_core ____cacheline_aligned; + static DEFINE_RAW_SPINLOCK(timekeeper_lock); -static seqcount_t timekeeper_seq; static struct timekeeper shadow_timekeeper; +/** + * struct tk_fast - NMI safe timekeeper + * @seq: Sequence counter for protecting updates. The lowest bit + * is the index for the tk_read_base array + * @base: tk_read_base array. Access is indexed by the lowest bit of + * @seq. + * + * See @update_fast_timekeeper() below. + */ +struct tk_fast { + seqcount_t seq; + struct tk_read_base base[2]; +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned; + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -45,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false; static inline void tk_normalize_xtime(struct timekeeper *tk) { - while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { - tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { + tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; tk->xtime_sec++; } } -static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +static inline struct timespec64 tk_xtime(struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); + return ts; +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; - tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; + tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; } -static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; - tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; tk_normalize_xtime(tk); } -static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { - struct timespec tmp; + struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ - set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); - WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64); tk->wall_to_monotonic = wtm; - set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec_to_ktime(tmp); + set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec64_to_ktime(tmp); tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } -static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - /* Verify consistency before modifying */ - WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); - - tk->total_sleep_time = t; - tk->offs_boot = timespec_to_ktime(t); + tk->offs_boot = ktime_add(tk->offs_boot, delta); } /** @@ -107,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = tk->clock; - tk->clock = clock; - tk->cycle_last = clock->cycle_last = clock->read(clock); + old_clock = tk->tkr.clock; + tk->tkr.clock = clock; + tk->tkr.read = clock->read; + tk->tkr.mask = clock->mask; + tk->tkr.cycle_last = tk->tkr.read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -133,78 +163,213 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - tk->xtime_nsec >>= -shift_change; + tk->tkr.xtime_nsec >>= -shift_change; else - tk->xtime_nsec <<= shift_change; + tk->tkr.xtime_nsec <<= shift_change; } - tk->shift = clock->shift; + tk->tkr.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - tk->mult = clock->mult; + tk->tkr.mult = clock->mult; + tk->ntp_err_mult = 0; } /* Timekeeper helper functions. */ #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -u32 (*arch_gettimeoffset)(void); - -u32 get_arch_timeoffset(void) -{ - if (likely(arch_gettimeoffset)) - return arch_gettimeoffset(); - return 0; -} +static u32 default_arch_gettimeoffset(void) { return 0; } +u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; #else -static inline u32 get_arch_timeoffset(void) { return 0; } +static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_get_ns(struct timekeeper *tk) +static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; + cycle_t cycle_now, delta; s64 nsec; /* read clocksource: */ - clock = tk->clock; - cycle_now = clock->read(clock); + cycle_now = tkr->read(tkr->clock); /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); - nsec = cycle_delta * tk->mult + tk->xtime_nsec; - nsec >>= tk->shift; + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ - return nsec + get_arch_timeoffset(); + return nsec + arch_gettimeoffset(); } static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; + struct clocksource *clock = tk->tkr.clock; + cycle_t cycle_now, delta; s64 nsec; /* read clocksource: */ - clock = tk->clock; - cycle_now = clock->read(clock); + cycle_now = tk->tkr.read(clock); /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); /* convert delta to nanoseconds. */ - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); /* If arch requires, add in get_arch_timeoffset() */ - return nsec + get_arch_timeoffset(); + return nsec + arch_gettimeoffset(); +} + +/** + * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. + * @tk: The timekeeper from which we take the update + * @tkf: The fast timekeeper to update + * @tbase: The time base for the fast timekeeper (mono/raw) + * + * We want to use this from any context including NMI and tracing / + * instrumenting the timekeeping code itself. + * + * So we handle this differently than the other timekeeping accessor + * functions which retry when the sequence count has changed. The + * update side does: + * + * smp_wmb(); <- Ensure that the last base[1] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[0], tk); + * smp_wmb(); <- Ensure that the base[0] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[1], tk); + * + * The reader side does: + * + * do { + * seq = tkf->seq; + * smp_rmb(); + * idx = seq & 0x01; + * now = now(tkf->base[idx]); + * smp_rmb(); + * } while (seq != tkf->seq) + * + * As long as we update base[0] readers are forced off to + * base[1]. Once base[0] is updated readers are redirected to base[0] + * and the base[1] update takes place. + * + * So if a NMI hits the update of base[0] then it will use base[1] + * which is still consistent. In the worst case this can result is a + * slightly wrong timestamp (a few nanoseconds). See + * @ktime_get_mono_fast_ns. + */ +static void update_fast_timekeeper(struct timekeeper *tk) +{ + struct tk_read_base *base = tk_fast_mono.base; + + /* Force readers off to base[1] */ + raw_write_seqcount_latch(&tk_fast_mono.seq); + + /* Update base[0] */ + memcpy(base, &tk->tkr, sizeof(*base)); + + /* Force readers back to base[0] */ + raw_write_seqcount_latch(&tk_fast_mono.seq); + + /* Update base[1] */ + memcpy(base + 1, base, sizeof(*base)); } +/** + * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic + * + * This timestamp is not guaranteed to be monotonic across an update. + * The timestamp is calculated by: + * + * now = base_mono + clock_delta * slope + * + * So if the update lowers the slope, readers who are forced to the + * not yet updated second array are still using the old steeper slope. + * + * tmono + * ^ + * | o n + * | o n + * | u + * | o + * |o + * |12345678---> reader order + * + * o = old slope + * u = update + * n = new slope + * + * So reader 6 will observe time going backwards versus reader 5. + * + * While other CPUs are likely to be able observe that, the only way + * for a CPU local observation is when an NMI hits in the middle of + * the update. Timestamps taken from that NMI context might be ahead + * of the following timestamps. Callers need to be aware of that and + * deal with it. + */ +u64 notrace ktime_get_mono_fast_ns(void) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount(&tk_fast_mono.seq); + tkr = tk_fast_mono.base + (seq & 0x01); + now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); + + } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); + return now; +} +EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); + +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD + +static inline void update_vsyscall(struct timekeeper *tk) +{ + struct timespec xt, wm; + + xt = timespec64_to_timespec(tk_xtime(tk)); + wm = timespec64_to_timespec(tk->wall_to_monotonic); + update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, + tk->tkr.cycle_last); +} + +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. + */ + remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); + tk->tkr.xtime_nsec -= remainder; + tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; +} +#else +#define old_vsyscall_fixup(tk) +#endif + static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) @@ -217,7 +382,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; int ret; @@ -247,6 +412,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); +/* + * Update the ktime_t based scalar nsec members of the timekeeper + */ +static inline void tk_update_ktime_data(struct timekeeper *tk) +{ + s64 nsec; + + /* + * The xtime based monotonic readout is: + * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); + * The ktime based monotonic readout is: + * nsec = base_mono + now(); + * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + */ + nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec *= NSEC_PER_SEC; + nsec += tk->wall_to_monotonic.tv_nsec; + tk->tkr.base_mono = ns_to_ktime(nsec); + + /* Update the monotonic raw base */ + tk->base_raw = timespec64_to_ktime(tk->raw_time); +} + /* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, unsigned int action) { @@ -257,8 +445,13 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + tk_update_ktime_data(tk); + if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); + + update_fast_timekeeper(tk); } /** @@ -270,49 +463,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; + struct clocksource *clock = tk->tkr.clock; + cycle_t cycle_now, delta; s64 nsec; - clock = tk->clock; - cycle_now = clock->read(clock); - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - tk->cycle_last = clock->cycle_last = cycle_now; + cycle_now = tk->tkr.read(clock); + delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); + tk->tkr.cycle_last = cycle_now; - tk->xtime_nsec += cycle_delta * tk->mult; + tk->tkr.xtime_nsec += delta * tk->tkr.mult; /* If arch requires, add in get_arch_timeoffset() */ - tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; + tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&tk->raw_time, nsec); + nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); + timespec64_add_ns(&tk->raw_time, nsec); } /** - * __getnstimeofday - Returns the time of day in a timespec. + * __getnstimeofday64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Updates the time of day in the timespec. * Returns 0 on success, or -ve when suspended (timespec will be undefined). */ -int __getnstimeofday(struct timespec *ts) +int __getnstimeofday64(struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; s64 nsecs = 0; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsecs = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(&tk->tkr); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; - timespec_add_ns(ts, nsecs); + timespec64_add_ns(ts, nsecs); /* * Do not bail out early, in case there were callers still using @@ -322,116 +514,138 @@ int __getnstimeofday(struct timespec *ts) return -EAGAIN; return 0; } -EXPORT_SYMBOL(__getnstimeofday); +EXPORT_SYMBOL(__getnstimeofday64); /** - * getnstimeofday - Returns the time of day in a timespec. + * getnstimeofday64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec (WARN if suspended). */ -void getnstimeofday(struct timespec *ts) +void getnstimeofday64(struct timespec64 *ts) { - WARN_ON(__getnstimeofday(ts)); + WARN_ON(__getnstimeofday64(ts)); } -EXPORT_SYMBOL(getnstimeofday); +EXPORT_SYMBOL(getnstimeofday64); ktime_t ktime_get(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; - s64 secs, nsecs; + ktime_t base; + s64 nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); - secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr.base_mono; + nsecs = timekeeping_get_ns(&tk->tkr); - } while (read_seqcount_retry(&timekeeper_seq, seq)); - /* - * Use ktime_set/ktime_add_ns to create a proper ktime on - * 32-bit architectures without CONFIG_KTIME_SCALAR. - */ - return ktime_add_ns(ktime_set(secs, 0), nsecs); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) +static ktime_t *offsets[TK_OFFS_MAX] = { + [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, + [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, +}; + +ktime_t ktime_get_with_offset(enum tk_offsets offs) { - struct timekeeper *tk = &timekeeper; - struct timespec tomono; - s64 nsec; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; + ktime_t base, *offset = offsets[offs]; + s64 nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); - ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(tk); - tomono = tk->wall_to_monotonic; + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr.base_mono, *offset); + nsecs = timekeeping_get_ns(&tk->tkr); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - ts->tv_sec += tomono.tv_sec; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_with_offset); /** - * timekeeping_clocktai - Returns the TAI time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. + * ktime_mono_to_any() - convert mononotic time to any other time + * @tmono: time to convert. + * @offs: which offset to use */ -void timekeeping_clocktai(struct timespec *ts) +ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { - struct timekeeper *tk = &timekeeper; + ktime_t *offset = offsets[offs]; unsigned long seq; - u64 nsecs; - - WARN_ON(timekeeping_suspended); + ktime_t tconv; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); + tconv = ktime_add(tmono, *offset); + } while (read_seqcount_retry(&tk_core.seq, seq)); - ts->tv_sec = tk->xtime_sec + tk->tai_offset; - nsecs = timekeeping_get_ns(tk); + return tconv; +} +EXPORT_SYMBOL_GPL(ktime_mono_to_any); - } while (read_seqcount_retry(&timekeeper_seq, seq)); +/** + * ktime_get_raw - Returns the raw monotonic time in ktime_t format + */ +ktime_t ktime_get_raw(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + s64 nsecs; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsecs); + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->base_raw; + nsecs = timekeeping_get_ns_raw(tk); -} -EXPORT_SYMBOL(timekeeping_clocktai); + } while (read_seqcount_retry(&tk_core.seq, seq)); + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_raw); /** - * ktime_get_clocktai - Returns the TAI time of day in a ktime + * ktime_get_ts64 - get the monotonic clock in timespec64 format + * @ts: pointer to timespec variable * - * Returns the time of day in a ktime. + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. */ -ktime_t ktime_get_clocktai(void) +void ktime_get_ts64(struct timespec64 *ts) { - struct timespec ts; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono; + s64 nsec; + unsigned int seq; + + WARN_ON(timekeeping_suspended); - timekeeping_clocktai(&ts); - return timespec_to_ktime(ts); + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(&tk->tkr); + tomono = tk->wall_to_monotonic; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); } -EXPORT_SYMBOL(ktime_get_clocktai); +EXPORT_SYMBOL_GPL(ktime_get_ts64); #ifdef CONFIG_NTP_PPS @@ -446,23 +660,23 @@ EXPORT_SYMBOL(ktime_get_clocktai); */ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; s64 nsecs_raw, nsecs_real; WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - *ts_raw = tk->raw_time; + *ts_raw = timespec64_to_timespec(tk->raw_time); ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; nsecs_raw = timekeeping_get_ns_raw(tk); - nsecs_real = timekeeping_get_ns(tk); + nsecs_real = timekeeping_get_ns(&tk->tkr); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -479,9 +693,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real); */ void do_gettimeofday(struct timeval *tv) { - struct timespec now; + struct timespec64 now; - getnstimeofday(&now); + getnstimeofday64(&now); tv->tv_sec = now.tv_sec; tv->tv_usec = now.tv_nsec/1000; } @@ -495,15 +709,15 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { - struct timekeeper *tk = &timekeeper; - struct timespec ts_delta, xt; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts_delta, xt, tmp; unsigned long flags; if (!timespec_valid_strict(tv)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -511,13 +725,14 @@ int do_settimeofday(const struct timespec *tv) ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); - tk_set_xtime(tk, tv); + tmp = timespec_to_timespec64(*tv); + tk_set_xtime(tk, &tmp); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -535,33 +750,35 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec tmp; + struct timespec64 ts64, tmp; int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; + ts64 = timespec_to_timespec64(*ts); + raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ - tmp = timespec_add(tk_xtime(tk), *ts); - if (!timespec_valid_strict(&tmp)) { + tmp = timespec64_add(tk_xtime(tk), ts64); + if (!timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } - tk_xtime_add(tk, ts); - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); + tk_xtime_add(tk, &ts64); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -578,14 +795,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset); */ s32 timekeeping_get_tai_offset(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; s32 ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ret = tk->tai_offset; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -606,14 +823,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) */ void timekeeping_set_tai_offset(s32 tai_offset) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); __timekeeping_set_tai_offset(tk, tai_offset); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clock_was_set(); } @@ -625,14 +842,14 @@ void timekeeping_set_tai_offset(s32 tai_offset) */ static int change_clocksource(void *data) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* @@ -641,7 +858,7 @@ static int change_clocksource(void *data) */ if (try_module_get(new->owner)) { if (!new->enable || new->enable(new) == 0) { - old = tk->clock; + old = tk->tkr.clock; tk_setup_internals(tk, new); if (old->disable) old->disable(old); @@ -652,7 +869,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; @@ -667,29 +884,14 @@ static int change_clocksource(void *data) */ int timekeeping_notify(struct clocksource *clock) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; - if (tk->clock == clock) + if (tk->tkr.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); - return tk->clock == clock ? 0 : -1; -} - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); + return tk->tkr.clock == clock ? 0 : -1; } -EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec @@ -699,18 +901,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real); */ void getrawmonotonic(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts64; unsigned long seq; s64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); nsecs = timekeeping_get_ns_raw(tk); - *ts = tk->raw_time; + ts64 = tk->raw_time; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec_add_ns(ts, nsecs); + timespec64_add_ns(&ts64, nsecs); + *ts = timespec64_to_timespec(ts64); } EXPORT_SYMBOL(getrawmonotonic); @@ -719,16 +923,16 @@ EXPORT_SYMBOL(getrawmonotonic); */ int timekeeping_valid_for_hres(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; int ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -738,16 +942,16 @@ int timekeeping_valid_for_hres(void) */ u64 timekeeping_max_deferment(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - ret = tk->clock->max_idle_ns; + ret = tk->tkr.clock->max_idle_ns; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -787,14 +991,15 @@ void __weak read_boot_clock(struct timespec *ts) */ void __init timekeeping_init(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec now, boot, tmp; - - read_persistent_clock(&now); + struct timespec64 now, boot, tmp; + struct timespec ts; - if (!timespec_valid_strict(&now)) { + read_persistent_clock(&ts); + now = timespec_to_timespec64(ts); + if (!timespec64_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; @@ -802,8 +1007,9 @@ void __init timekeeping_init(void) } else if (now.tv_sec || now.tv_nsec) persistent_clock_exist = true; - read_boot_clock(&boot); - if (!timespec_valid_strict(&boot)) { + read_boot_clock(&ts); + boot = timespec_to_timespec64(ts); + if (!timespec64_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); boot.tv_sec = 0; @@ -811,7 +1017,7 @@ void __init timekeeping_init(void) } raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); ntp_init(); clock = clocksource_default_clock(); @@ -822,24 +1028,21 @@ void __init timekeeping_init(void) tk_set_xtime(tk, &now); tk->raw_time.tv_sec = 0; tk->raw_time.tv_nsec = 0; + tk->base_raw.tv64 = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); - set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); - tmp.tv_sec = 0; - tmp.tv_nsec = 0; - tk_set_sleep_time(tk, tmp); - - memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + timekeeping_update(tk, TK_MIRROR); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ -static struct timespec timekeeping_suspend_time; +static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval @@ -849,17 +1052,17 @@ static struct timespec timekeeping_suspend_time; * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, - struct timespec *delta) + struct timespec64 *delta) { - if (!timespec_valid_strict(delta)) { + if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); - tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); + tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } @@ -875,7 +1078,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, */ void timekeeping_inject_sleeptime(struct timespec *delta) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tmp; unsigned long flags; /* @@ -886,15 +1090,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta) return; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); - __timekeeping_inject_sleeptime(tk, delta); + tmp = timespec_to_timespec64(*delta); + __timekeeping_inject_sleeptime(tk, &tmp); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -910,20 +1115,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { - struct timekeeper *tk = &timekeeper; - struct clocksource *clock = tk->clock; + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock = tk->tkr.clock; unsigned long flags; - struct timespec ts_new, ts_delta; + struct timespec64 ts_new, ts_delta; + struct timespec tmp; cycle_t cycle_now, cycle_delta; bool suspendtime_found = false; - read_persistent_clock(&ts_new); + read_persistent_clock(&tmp); + ts_new = timespec_to_timespec64(tmp); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); /* * After system resumes, we need to calculate the suspended time and @@ -937,15 +1144,16 @@ static void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = clock->read(clock); + cycle_now = tk->tkr.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > clock->cycle_last) { + cycle_now > tk->tkr.cycle_last) { u64 num, max = ULLONG_MAX; u32 mult = clock->mult; u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, + tk->tkr.mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -960,10 +1168,10 @@ static void timekeeping_resume(void) } nsec += ((u64) cycle_delta * mult) >> shift; - ts_delta = ns_to_timespec(nsec); + ts_delta = ns_to_timespec64(nsec); suspendtime_found = true; - } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { - ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); + } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); suspendtime_found = true; } @@ -971,11 +1179,11 @@ static void timekeeping_resume(void) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - tk->cycle_last = clock->cycle_last = cycle_now; + tk->tkr.cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -988,12 +1196,14 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec delta, delta_delta; - static struct timespec old_delta; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; + struct timespec tmp; - read_persistent_clock(&timekeeping_suspend_time); + read_persistent_clock(&tmp); + timekeeping_suspend_time = timespec_to_timespec64(tmp); /* * On some systems the persistent_clock can not be detected at @@ -1004,7 +1214,7 @@ static int timekeeping_suspend(void) persistent_clock_exist = true; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -1014,8 +1224,8 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); - delta_delta = timespec_sub(delta, old_delta); + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction @@ -1025,11 +1235,11 @@ static int timekeeping_suspend(void) } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = - timespec_add(timekeeping_suspend_time, delta_delta); + timespec64_add(timekeeping_suspend_time, delta_delta); } timekeeping_update(tk, TK_MIRROR); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); @@ -1050,125 +1260,34 @@ static int __init timekeeping_init_ops(void) register_syscore_ops(&timekeeping_syscore_ops); return 0; } - device_initcall(timekeeping_init_ops); /* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. + * Apply a multiplier adjustment to the timekeeper */ -static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, - s64 error, s64 *interval, - s64 *offset) +static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, + s64 offset, + bool negative, + int adj_scale) { - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adjusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). - */ - error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; + s64 interval = tk->cycle_interval; + s32 mult_adj = 1; - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); - tick_error -= tk->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; + if (negative) { + mult_adj = -mult_adj; + interval = -interval; + offset = -offset; } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void timekeeping_adjust(struct timekeeper *tk, s64 offset) -{ - s64 error, interval = tk->cycle_interval; - int adj; + mult_adj <<= adj_scale; + interval <<= adj_scale; + offset <<= adj_scale; /* - * The point of this is to check if the error is greater than half - * an interval. - * - * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. - * - * Note we subtract one in the shift, so that error is really error*2. - * This "saves" dividing(shifting) interval twice, but keeps the - * (error > interval) comparison as still measuring if error is - * larger than half an interval. - * - * Note: It does not "save" on aggravation when reading the code. - */ - error = tk->ntp_error >> (tk->ntp_error_shift - 1); - if (error > interval) { - /* - * We now divide error by 4(via shift), which checks if - * the error is greater than twice the interval. - * If it is greater, we need a bigadjust, if its smaller, - * we can adjust by 1. - */ - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = timekeeping_bigadjust(tk, error, &interval, &offset); - } else { - if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else { - adj = timekeeping_bigadjust(tk, error, &interval, &offset); - } - } else { - goto out_adjust; - } - } - - if (unlikely(tk->clock->maxadj && - (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { - printk_deferred_once(KERN_WARNING - "Adjusting %s more than 11%% (%ld vs %ld)\n", - tk->clock->name, (long)tk->mult + adj, - (long)tk->clock->mult + tk->clock->maxadj); - } - /* * So the following can be confusing. * - * To keep things simple, lets assume adj == 1 for now. + * To keep things simple, lets assume mult_adj == 1 for now. * - * When adj != 1, remember that the interval and offset values + * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier @@ -1212,12 +1331,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * * XXX - TODO: Doc ntp_error calculation. */ - tk->mult += adj; + tk->tkr.mult += mult_adj; tk->xtime_interval += interval; - tk->xtime_nsec -= offset; + tk->tkr.xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +} + +/* + * Calculate the multiplier adjustment needed to match the frequency + * specified by NTP + */ +static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, + s64 offset) +{ + s64 interval = tk->cycle_interval; + s64 xinterval = tk->xtime_interval; + s64 tick_error; + bool negative; + u32 adj; + + /* Remove any current error adj from freq calculation */ + if (tk->ntp_err_mult) + xinterval -= tk->cycle_interval; + + tk->ntp_tick = ntp_tick_length(); + + /* Calculate current error per tick */ + tick_error = ntp_tick_length() >> tk->ntp_error_shift; + tick_error -= (xinterval + tk->xtime_remainder); + + /* Don't worry about correcting it if its small */ + if (likely((tick_error >= 0) && (tick_error <= interval))) + return; + + /* preserve the direction of correction */ + negative = (tick_error < 0); + + /* Sort out the magnitude of the correction */ + tick_error = abs(tick_error); + for (adj = 0; tick_error > interval; adj++) + tick_error >>= 1; + + /* scale the corrections */ + timekeeping_apply_adjustment(tk, offset, negative, adj); +} + +/* + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. + */ +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +{ + /* Correct for the current frequency error */ + timekeeping_freqadjust(tk, offset); + + /* Next make a small adjustment to fix any cumulative error */ + if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { + tk->ntp_err_mult = 1; + timekeeping_apply_adjustment(tk, offset, 0, 0); + } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { + /* Undo any existing error adjustment */ + timekeeping_apply_adjustment(tk, offset, 1, 0); + tk->ntp_err_mult = 0; + } + + if (unlikely(tk->tkr.clock->maxadj && + (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", + tk->tkr.clock->name, (long)tk->tkr.mult, + (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); + } -out_adjust: /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource @@ -1232,12 +1417,11 @@ out_adjust: * We'll correct this error next time through this function, when * xtime_nsec is not as small. */ - if (unlikely((s64)tk->xtime_nsec < 0)) { - s64 neg = -(s64)tk->xtime_nsec; - tk->xtime_nsec = 0; + if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { + s64 neg = -(s64)tk->tkr.xtime_nsec; + tk->tkr.xtime_nsec = 0; tk->ntp_error += neg << tk->ntp_error_shift; } - } /** @@ -1250,26 +1434,26 @@ out_adjust: */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { - u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; unsigned int clock_set = 0; - while (tk->xtime_nsec >= nsecps) { + while (tk->tkr.xtime_nsec >= nsecps) { int leap; - tk->xtime_nsec -= nsecps; + tk->tkr.xtime_nsec -= nsecps; tk->xtime_sec++; /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { - struct timespec ts; + struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, - timespec_sub(tk->wall_to_monotonic, ts)); + timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); @@ -1301,9 +1485,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->cycle_last += interval; + tk->tkr.cycle_last += interval; - tk->xtime_nsec += tk->xtime_interval << shift; + tk->tkr.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ @@ -1317,48 +1501,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } -#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD -static inline void old_vsyscall_fixup(struct timekeeper *tk) -{ - s64 remainder; - - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD - * users are removed, this can be killed. - */ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; -} -#else -#define old_vsyscall_fixup(tk) -#endif - - - /** * update_wall_time - Uses the current clocksource to increment the wall time * */ void update_wall_time(void) { - struct clocksource *clock; - struct timekeeper *real_tk = &timekeeper; + struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; @@ -1371,12 +1527,11 @@ void update_wall_time(void) if (unlikely(timekeeping_suspended)) goto out; - clock = real_tk->clock; - #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; + offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), + tk->tkr.cycle_last, tk->tkr.mask); #endif /* Check if there's really nothing to do */ @@ -1418,9 +1573,7 @@ void update_wall_time(void) */ clock_set |= accumulate_nsecs_to_secs(tk); - write_seqcount_begin(&timekeeper_seq); - /* Update clock->cycle_last with the new value */ - clock->cycle_last = tk->cycle_last; + write_seqcount_begin(&tk_core.seq); /* * Update the real timekeeper. * @@ -1428,12 +1581,12 @@ void update_wall_time(void) * requires changes to all other timekeeper usage sites as * well, i.e. move the timekeeper pointer getter into the * spinlocked/seqcount protected sections. And we trade this - * memcpy under the timekeeper_seq against one before we start + * memcpy under the tk_core.seq against one before we start * updating. */ memcpy(real_tk, tk, sizeof(*tk)); timekeeping_update(real_tk, clock_set); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) @@ -1454,83 +1607,16 @@ out: */ void getboottime(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; - struct timespec boottime = { - .tv_sec = tk->wall_to_monotonic.tv_sec + - tk->total_sleep_time.tv_sec, - .tv_nsec = tk->wall_to_monotonic.tv_nsec + - tk->total_sleep_time.tv_nsec - }; - - set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); -} -EXPORT_SYMBOL_GPL(getboottime); - -/** - * get_monotonic_boottime - Returns monotonic time since boot - * @ts: pointer to the timespec to be set - * - * Returns the monotonic time since boot in a timespec. - * - * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also - * includes the time spent in suspend. - */ -void get_monotonic_boottime(struct timespec *ts) -{ - struct timekeeper *tk = &timekeeper; - struct timespec tomono, sleep; - s64 nsec; - unsigned int seq; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&timekeeper_seq); - ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(tk); - tomono = tk->wall_to_monotonic; - sleep = tk->total_sleep_time; - - } while (read_seqcount_retry(&timekeeper_seq, seq)); - - ts->tv_sec += tomono.tv_sec + sleep.tv_sec; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); -} -EXPORT_SYMBOL_GPL(get_monotonic_boottime); - -/** - * ktime_get_boottime - Returns monotonic time since boot in a ktime - * - * Returns the monotonic time since boot in a ktime - * - * This is similar to CLOCK_MONTONIC/ktime_get, but also - * includes the time spent in suspend. - */ -ktime_t ktime_get_boottime(void) -{ - struct timespec ts; - - get_monotonic_boottime(&ts); - return timespec_to_ktime(ts); -} -EXPORT_SYMBOL_GPL(ktime_get_boottime); - -/** - * monotonic_to_bootbased - Convert the monotonic time to boot based. - * @ts: pointer to the timespec to be converted - */ -void monotonic_to_bootbased(struct timespec *ts) -{ - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); - *ts = timespec_add(*ts, tk->total_sleep_time); + *ts = ktime_to_timespec(t); } -EXPORT_SYMBOL_GPL(monotonic_to_bootbased); +EXPORT_SYMBOL_GPL(getboottime); unsigned long get_seconds(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } @@ -1538,43 +1624,44 @@ EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; - return tk_xtime(tk); + return timespec64_to_timespec(tk_xtime(tk)); } struct timespec current_kernel_time(void) { - struct timekeeper *tk = &timekeeper; - struct timespec now; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now; unsigned long seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - return now; + return timespec64_to_timespec(now); } EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { - struct timekeeper *tk = &timekeeper; - struct timespec now, mono; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now, mono; unsigned long seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); - return now; + + return timespec64_to_timespec(now); } /* @@ -1587,29 +1674,38 @@ void do_timer(unsigned long ticks) } /** - * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, - * and sleep offsets. - * @xtim: pointer to timespec to be set with xtime - * @wtom: pointer to timespec to be set with wall_to_monotonic - * @sleep: pointer to timespec to be set with time in suspend + * ktime_get_update_offsets_tick - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns monotonic time at last tick and various offsets */ -void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, - struct timespec *wtom, struct timespec *sleep) +ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { - struct timekeeper *tk = &timekeeper; - unsigned long seq; + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); - *xtim = tk_xtime(tk); - *wtom = tk->wall_to_monotonic; - *sleep = tk->total_sleep_time; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + seq = read_seqcount_begin(&tk_core.seq); + + base = tk->tkr.base_mono; + nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; + + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); } #ifdef CONFIG_HIGH_RES_TIMERS /** - * ktime_get_update_offsets - hrtimer helper + * ktime_get_update_offsets_now - hrtimer helper * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset @@ -1617,57 +1713,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, +ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { - struct timekeeper *tk = &timekeeper; - ktime_t now; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; - u64 secs, nsecs; + ktime_t base; + u64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - secs = tk->xtime_sec; - nsecs = timekeeping_get_ns(tk); + base = tk->tkr.base_mono; + nsecs = timekeeping_get_ns(&tk->tkr); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - now = ktime_add_ns(ktime_set(secs, 0), nsecs); - now = ktime_sub(now, *offs_real); - return now; + return ktime_add_ns(base, nsecs); } #endif /** - * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format - */ -ktime_t ktime_get_monotonic_offset(void) -{ - struct timekeeper *tk = &timekeeper; - unsigned long seq; - struct timespec wtom; - - do { - seq = read_seqcount_begin(&timekeeper_seq); - wtom = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); - - return timespec_to_ktime(wtom); -} -EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - -/** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ int do_adjtimex(struct timex *txc) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec ts; + struct timespec64 ts; s32 orig_tai, tai; int ret; @@ -1687,10 +1763,10 @@ int do_adjtimex(struct timex *txc) return ret; } - getnstimeofday(&ts); + getnstimeofday64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); @@ -1699,7 +1775,7 @@ int do_adjtimex(struct timex *txc) __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (tai != orig_tai) @@ -1719,11 +1795,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); __hardpps(phase_ts, raw_ts); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 000000000000..adc1fc98bde3 --- /dev/null +++ b/kernel/time/timekeeping.h @@ -0,0 +1,20 @@ +#ifndef _KERNEL_TIME_TIMEKEEPING_H +#define _KERNEL_TIME_TIMEKEEPING_H +/* + * Internal interfaces for kernel/time/ + */ +extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); + +extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); +extern int timekeeping_inject_offset(struct timespec *ts); +extern s32 timekeeping_get_tai_offset(void); +extern void timekeeping_set_tai_offset(s32 tai_offset); +extern void timekeeping_clocktai(struct timespec *ts); + +#endif diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 4d54f97558df..f6bd65236712 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void) } late_initcall(tk_debug_sleep_time_init); -void tk_debug_account_sleep_time(struct timespec *t) +void tk_debug_account_sleep_time(struct timespec64 *t) { sleep_time_bin[fls(t->tv_sec)]++; } diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 13323ea08ffa..4ea005a7f9da 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -3,12 +3,27 @@ /* * timekeeping debug functions */ +#include <linux/clocksource.h> #include <linux/time.h> #ifdef CONFIG_DEBUG_FS -extern void tk_debug_account_sleep_time(struct timespec *t); +extern void tk_debug_account_sleep_time(struct timespec64 *t); #else #define tk_debug_account_sleep_time(x) #endif +#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + cycle_t ret = (now - last) & mask; + + return (s64) ret > 0 ? ret : 0; +} +#else +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + return (now - last) & mask; +} +#endif + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/timer.c b/kernel/time/timer.c index 3bb01a323b2a..aca5dfe2fa3d 100644 --- a/kernel/timer.c +++ b/kernel/time/timer.c @@ -82,6 +82,7 @@ struct tvec_base { unsigned long next_timer; unsigned long active_timers; unsigned long all_timers; + int cpu; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -409,6 +410,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) base->next_timer = timer->expires; } base->all_timers++; + + /* + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. + */ + if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); } #ifdef CONFIG_TIMER_STATS @@ -948,22 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu) timer_set_base(timer, base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); - /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. - * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. - * - * Spare the IPI for deferrable timers on idle targets though. - * The next busy ticks will take care of it. Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. - */ - if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) - wake_up_nohz_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1568,6 +1569,7 @@ static int init_timers_cpu(int cpu) } spin_lock_init(&base->lock); tvec_base_done[cpu] = 1; + base->cpu = cpu; } else { base = per_cpu(tvec_bases, cpu); } diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c new file mode 100644 index 000000000000..e622ba365a13 --- /dev/null +++ b/kernel/time/udelay_test.c @@ -0,0 +1,168 @@ +/* + * udelay() test kernel module + * + * Test is executed by writing and reading to /sys/kernel/debug/udelay_test + * Tests are configured by writing: USECS ITERATIONS + * Tests are executed by reading from the same file. + * Specifying usecs of 0 or negative values will run multiples tests. + * + * Copyright (C) 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#define DEFAULT_ITERATIONS 100 + +#define DEBUGFS_FILENAME "udelay_test" + +static DEFINE_MUTEX(udelay_test_lock); +static struct dentry *udelay_test_debugfs_file; +static int udelay_test_usecs; +static int udelay_test_iterations = DEFAULT_ITERATIONS; + +static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) +{ + int min = 0, max = 0, fail_count = 0; + uint64_t sum = 0; + uint64_t avg; + int i; + /* Allow udelay to be up to 0.5% fast */ + int allowed_error_ns = usecs * 5; + + for (i = 0; i < iters; ++i) { + struct timespec ts1, ts2; + int time_passed; + + ktime_get_ts(&ts1); + udelay(usecs); + ktime_get_ts(&ts2); + time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + + if (i == 0 || time_passed < min) + min = time_passed; + if (i == 0 || time_passed > max) + max = time_passed; + if ((time_passed + allowed_error_ns) / 1000 < usecs) + ++fail_count; + WARN_ON(time_passed < 0); + sum += time_passed; + } + + avg = sum; + do_div(avg, iters); + seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", + usecs, iters, usecs * 1000, + (usecs * 1000) - allowed_error_ns, min, avg, max); + if (fail_count) + seq_printf(s, " FAIL=%d", fail_count); + seq_puts(s, "\n"); + + return 0; +} + +static int udelay_test_show(struct seq_file *s, void *v) +{ + int usecs; + int iters; + int ret = 0; + + mutex_lock(&udelay_test_lock); + usecs = udelay_test_usecs; + iters = udelay_test_iterations; + mutex_unlock(&udelay_test_lock); + + if (usecs > 0 && iters > 0) { + return udelay_test_single(s, usecs, iters); + } else if (usecs == 0) { + struct timespec ts; + + ktime_get_ts(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", + loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + seq_puts(s, "usage:\n"); + seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); + seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); + } + + return ret; +} + +static int udelay_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, udelay_test_show, inode->i_private); +} + +static ssize_t udelay_test_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + char lbuf[32]; + int ret; + int usecs; + int iters; + + if (count >= sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + lbuf[count] = '\0'; + + ret = sscanf(lbuf, "%d %d", &usecs, &iters); + if (ret < 1) + return -EINVAL; + else if (ret < 2) + iters = DEFAULT_ITERATIONS; + + mutex_lock(&udelay_test_lock); + udelay_test_usecs = usecs; + udelay_test_iterations = iters; + mutex_unlock(&udelay_test_lock); + + return count; +} + +static const struct file_operations udelay_test_debugfs_ops = { + .owner = THIS_MODULE, + .open = udelay_test_open, + .read = seq_read, + .write = udelay_test_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init udelay_test_init(void) +{ + mutex_lock(&udelay_test_lock); + udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, + S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); + mutex_unlock(&udelay_test_lock); + + return 0; +} + +module_init(udelay_test_init); + +static void __exit udelay_test_exit(void) +{ + mutex_lock(&udelay_test_lock); + debugfs_remove(udelay_test_debugfs_file); + mutex_unlock(&udelay_test_lock); +} + +module_exit(udelay_test_exit); + +MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); +MODULE_LICENSE("GPL"); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 925f629658d6..afb04b9b818a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1968,7 +1968,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) /** * rb_update_event - update event type and data - * @event: the even to update + * @event: the event to update * @type: the type of event * @length: the size of the event field in the ring buffer * @@ -3341,21 +3341,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ - if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = rb_set_head_page(cpu_buffer); - if (unlikely(!iter->head_page)) - return; - iter->head = iter->head_page->read; - } else { - iter->head_page = cpu_buffer->reader_page; - iter->head = cpu_buffer->reader_page->read; - } + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + + iter->cache_reader_page = iter->head_page; + iter->cache_read = iter->head; + if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; - iter->cache_reader_page = cpu_buffer->reader_page; - iter->cache_read = cpu_buffer->read; } /** @@ -3748,12 +3743,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a time extend is encountered. - * Since the time extend is always attached to a data event, - * we should never loop more than once. - * (We never hit the following condition more than twice). + * We repeat when a time extend is encountered or we hit + * the end of the page. Since the time extend is always attached + * to a data event, we should never loop more than three times. + * Once for going to next page, once on time extend, and + * finally once to get the event. + * (We never hit the following condition more than thrice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bb80fe08767..8a528392b1f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -820,11 +820,12 @@ static struct { const char *name; int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { - { trace_clock_local, "local", 1 }, - { trace_clock_global, "global", 1 }, - { trace_clock_counter, "counter", 0 }, - { trace_clock_jiffies, "uptime", 0 }, - { trace_clock, "perf", 1 }, + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 0 }, + { trace_clock, "perf", 1 }, + { ktime_get_mono_fast_ns, "mono", 1 }, ARCH_TRACE_CLOCKS }; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index a1dd9a1b1327..975cb49e32bf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; - struct timespec uptime, ts; cputime_t utime, stime, utimescaled, stimescaled; - u64 ac_etime; + u64 delta; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); - /* calculate task elapsed time in timespec */ - do_posix_clock_monotonic_gettime(&uptime); - ts = timespec_sub(uptime, tsk->start_time); - /* rebase elapsed time to usec (should never be negative) */ - ac_etime = timespec_to_ns(&ts); - do_div(ac_etime, NSEC_PER_USEC); - stats->ac_etime = ac_etime; - stats->ac_btime = get_seconds() - ts.tv_sec; + /* calculate task elapsed time in nsec */ + delta = ktime_get_ns() - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime = delta; + /* Convert to seconds for btime */ + do_div(delta, USEC_PER_SEC); + stats->ac_btime = get_seconds() - delta; if (thread_group_leader(tsk)) { stats->ac_exitcode = tsk->exit_code; if (tsk->flags & PF_FORKNOEXEC) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc02560fd6b..aa312b0dc3ec 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v) return; } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, diff --git a/kernel/utsname.c b/kernel/utsname.c index fd393124e507..883aaaa7de8a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task) struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->uts_ns; get_uts_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..a8d6914030fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) smp_mb__after_atomic(); } + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", |