diff options
Diffstat (limited to 'kernel')
56 files changed, 1810 insertions, 1061 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 2f4964cfde0b..a871bf80fde1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -160,7 +160,6 @@ static LIST_HEAD(audit_freelist); /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; -static void kauditd_hold_skb(struct sk_buff *skb); /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ @@ -454,30 +453,6 @@ static void auditd_set(int pid, u32 portid, struct net *net) } /** - * auditd_reset - Disconnect the auditd connection - * - * Description: - * Break the auditd/kauditd connection and move all the queued records into the - * hold queue in case auditd reconnects. - */ -static void auditd_reset(void) -{ - struct sk_buff *skb; - - /* if it isn't already broken, break the connection */ - rcu_read_lock(); - if (auditd_conn.pid) - auditd_set(0, 0, NULL); - rcu_read_unlock(); - - /* flush all of the main and retry queues to the hold queue */ - while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); - while ((skb = skb_dequeue(&audit_queue))) - kauditd_hold_skb(skb); -} - -/** * kauditd_print_skb - Print the audit record to the ring buffer * @skb: audit record * @@ -505,9 +480,6 @@ static void kauditd_rehold_skb(struct sk_buff *skb) { /* put the record back in the queue at the same place */ skb_queue_head(&audit_hold_queue, skb); - - /* fail the auditd connection */ - auditd_reset(); } /** @@ -544,9 +516,6 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); kfree_skb(skb); - - /* fail the auditd connection */ - auditd_reset(); } /** @@ -567,6 +536,30 @@ static void kauditd_retry_skb(struct sk_buff *skb) } /** + * auditd_reset - Disconnect the auditd connection + * + * Description: + * Break the auditd/kauditd connection and move all the queued records into the + * hold queue in case auditd reconnects. + */ +static void auditd_reset(void) +{ + struct sk_buff *skb; + + /* if it isn't already broken, break the connection */ + rcu_read_lock(); + if (auditd_conn.pid) + auditd_set(0, 0, NULL); + rcu_read_unlock(); + + /* flush all of the main and retry queues to the hold queue */ + while ((skb = skb_dequeue(&audit_retry_queue))) + kauditd_hold_skb(skb); + while ((skb = skb_dequeue(&audit_queue))) + kauditd_hold_skb(skb); +} + +/** * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record * @@ -758,6 +751,7 @@ static int kauditd_thread(void *dummy) NULL, kauditd_rehold_skb); if (rc < 0) { sk = NULL; + auditd_reset(); goto main_queue; } @@ -767,6 +761,7 @@ static int kauditd_thread(void *dummy) NULL, kauditd_hold_skb); if (rc < 0) { sk = NULL; + auditd_reset(); goto main_queue; } @@ -775,16 +770,18 @@ main_queue: * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the * multicast send and move the record to the retry queue */ - kauditd_send_queue(sk, portid, &audit_queue, 1, - kauditd_send_multicast_skb, - kauditd_retry_skb); + rc = kauditd_send_queue(sk, portid, &audit_queue, 1, + kauditd_send_multicast_skb, + kauditd_retry_skb); + if (sk == NULL || rc < 0) + auditd_reset(); + sk = NULL; /* drop our netns reference, no auditd sends past this line */ if (net) { put_net(net); net = NULL; } - sk = NULL; /* we have processed all the queues so wake everyone */ wake_up(&audit_backlog_wait); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f45827e205d3..b4f1cb0c5ac7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1162,12 +1162,12 @@ out: LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ off = IMM; load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are - * only appearing in the programs where ctx == - * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, - * internal BPF verifier will check that BPF_R6 == - * ctx. + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only + * appearing in the programs where ctx == skb + * (see may_access_skb() in the verifier). All programs + * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, + * bpf_convert_filter() saves it in BPF_R6, internal BPF + * verifier will check that BPF_R6 == ctx. * * BPF_ABS and BPF_IND are wrappers of function calls, * so they scratch BPF_R1-BPF_R5 registers, preserve diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7af0dcc5d755..821f9e807de5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -617,6 +617,14 @@ static void fixup_bpf_calls(struct bpf_prog *prog) if (insn->imm == BPF_FUNC_xdp_adjust_head) prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { + /* If we tail call into other programs, we + * cannot make any assumptions since they + * can be replaced dynamically during runtime + * in the program array. + */ + prog->cb_access = 1; + prog->xdp_adjust_head = 1; + /* mark bpf_tail_call as different opcode * to avoid conditional branch in * interpeter for every normal call diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 9203bfb05603..00f4d6bf048f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -5,6 +5,7 @@ #include <linux/kernfs.h> #include <linux/workqueue.h> #include <linux/list.h> +#include <linux/refcount.h> /* * A cgroup can be associated with multiple css_sets as different tasks may @@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset) * can see it. Similar to atomic_dec_and_lock(), but for an * rwlock */ - if (atomic_add_unless(&cset->refcount, -1, 1)) + if (refcount_dec_not_one(&cset->refcount)) return; spin_lock_irqsave(&css_set_lock, flags); @@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset) */ static inline void get_css_set(struct css_set *cset) { - atomic_inc(&cset->refcount); + refcount_inc(&cset->refcount); } bool cgroup_ssid_enabled(int ssid); @@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 1dc22f6b49f5..85d75152402d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp) spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); + count += refcount_read(&link->cset->refcount); spin_unlock_irq(&css_set_lock); return count; } @@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, struct cgroup_subsys *ss; struct dentry *dentry; int i, ret; + bool new_root = false; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, ret = -ENOMEM; goto out_unlock; } + new_root = true; init_cgroup_root(root, &opts); - ret = cgroup_setup_root(root, opts.subsys_mask); + ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); if (ret) cgroup_free_root(root); @@ -1201,6 +1203,18 @@ out_free: CGROUP_SUPER_MAGIC, ns); /* + * There's a race window after we release cgroup_mutex and before + * allocating a superblock. Make sure a concurrent process won't + * be able to re-use the root during this window by delaying the + * initialization of root refcnt. + */ + if (new_root) { + mutex_lock(&cgroup_mutex); + percpu_ref_reinit(&root->cgrp.self.refcnt); + mutex_unlock(&cgroup_mutex); + } + + /* * If @pinned_sb, we're reusing an existing root and holding an * extra ref on its sb. Mount is complete. Put the extra ref. */ @@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, u64 count; rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); + count = refcount_read(&task_css_set(current)->refcount); rcu_read_unlock(); return count; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 48851327a15e..c3c9a0e1b3c9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = { .counter = 2, }, + .count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, @@ -436,7 +436,12 @@ out_unlock: return css; } -static void cgroup_get(struct cgroup *cgrp) +static void __maybe_unused cgroup_get(struct cgroup *cgrp) +{ + css_get(&cgrp->self); +} + +static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); @@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css); * haven't been created. */ struct css_set init_css_set = { - .refcount = ATOMIC_INIT(1), + .refcount = REFCOUNT_INIT(1), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), @@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset) lockdep_assert_held(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) + if (!refcount_dec_and_test(&cset->refcount)) return; /* This css_set is dead. unlink it and release cgroup and css refs */ @@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) - cgroup_get(cgrp); + cgroup_get_live(cgrp); } /** @@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return NULL; } - atomic_set(&cset->refcount, 1); + refcount_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); @@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root_cgrp->id = ret; root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, - GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, + ref_flags, GFP_KERNEL); if (ret) goto out; @@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return ERR_PTR(-EINVAL); } cgrp_dfl_visible = true; - cgroup_get(&cgrp_dfl_root.cgrp); + cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); @@ -2425,11 +2430,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_NO_SETAFFINITY and become - * trapped in a cpuset, or RT worker may be born in a cgroup - * with no rt_runtime allocated. Just say no. + * kthreads may acquire PF_NO_SETAFFINITY during initialization. + * If userland migrates such a kthread to a non-root cgroup, it can + * become trapped in a cpuset, or RT kthread may be born in a + * cgroup with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { + if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out_unlock_rcu; } @@ -2575,7 +2581,7 @@ restart: if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; - cgroup_get(dsct); + cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); @@ -3946,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, { lockdep_assert_held(&cgroup_mutex); - cgroup_get(cgrp); + cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; @@ -4122,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); - cgroup_get(parent); + cgroup_get_live(parent); /* * @cgrp is now fully operational. If something fails after this @@ -4512,7 +4518,7 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); mutex_unlock(&cgroup_mutex); @@ -4946,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path) if (kn) { if (kernfs_type(kn) == KERNFS_DIR) { cgrp = kn->priv; - cgroup_get(cgrp); + cgroup_get_live(cgrp); } else { cgrp = ERR_PTR(-ENOTDIR); } @@ -5026,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) /* Socket clone path */ if (skcd->val) { + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ cgroup_get(sock_cgroup_ptr(skcd)); return; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f41292be0fb..f6501f4f6040 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2121,10 +2121,8 @@ int __init cpuset_init(void) { int err = 0; - if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) - BUG(); - if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) - BUG(); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); @@ -2139,8 +2137,7 @@ int __init cpuset_init(void) if (err < 0) return err; - if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) - BUG(); + BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); return 0; } @@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rebuild_sched_domains(); } -void cpuset_update_active_cpus(bool cpu_online) +void cpuset_update_active_cpus(void) { /* * We're inside cpu hotplug critical region which usually nests diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 96d38dab6fb2..66129eb4371d 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) kfree(new_ns); return ERR_PTR(ret); } - atomic_set(&new_ns->count, 1); + refcount_set(&new_ns->count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } diff --git a/kernel/compat.c b/kernel/compat.c index 19aec5d98108..933bcb31ae10 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, struct timezone __user *, tz) { + struct timespec64 new_ts; struct timeval user_tv; - struct timespec new_ts; struct timezone new_tz; if (tv) { @@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, return -EFAULT; } - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) @@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { struct timespec tu, rmt; + struct timespec64 tu64; mm_segment_t oldfs; long ret; if (compat_get_timespec(&tu, rqtp)) return -EFAULT; - if (!timespec_valid(&tu)) + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) return -EINVAL; oldfs = get_fs(); set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu, + ret = hrtimer_nanosleep(&tu64, rmtp ? (struct timespec __user *)&rmt : NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); diff --git a/kernel/cpu.c b/kernel/cpu.c index 37b223e4fc05..9ae6fbe5b5cf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ +int __boot_cpu_id; + #endif /* CONFIG_SMP */ /* Boot processor state steps */ @@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void) set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); + +#ifdef CONFIG_SMP + __boot_cpu_id = cpu; +#endif } /* diff --git a/kernel/fork.c b/kernel/fork.c index afa2947286cd..3a4343cdfe90 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; + p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } diff --git a/kernel/futex.c b/kernel/futex.c index 45858ec73941..357348a6cf6b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -802,7 +802,7 @@ static int refill_pi_state_cache(void) return 0; } -static struct futex_pi_state * alloc_pi_state(void) +static struct futex_pi_state *alloc_pi_state(void) { struct futex_pi_state *pi_state = current->pi_state_cache; @@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +static void get_pi_state(struct futex_pi_state *pi_state) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); +} + /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. @@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * Look up the task based on what TID userspace gave us. * We dont trust it. */ -static struct task_struct * futex_find_get_task(pid_t pid) +static struct task_struct *futex_find_get_task(pid_t pid) { struct task_struct *p; @@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr) pi_state->owner = NULL; raw_spin_unlock_irq(&curr->pi_lock); - rt_mutex_unlock(&pi_state->pi_mutex); - + get_pi_state(pi_state); spin_unlock(&hb->lock); + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); @@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr) * * [10] There is no transient state which leaves owner and user space * TID out of sync. + * + * + * Serialization and lifetime rules: + * + * hb->lock: + * + * hb -> futex_q, relation + * futex_q -> pi_state, relation + * + * (cannot be raw because hb can contain arbitrary amount + * of futex_q's) + * + * pi_mutex->wait_lock: + * + * {uval, pi_state} + * + * (and pi_mutex 'obviously') + * + * p->pi_lock: + * + * p->pi_state_list -> pi_state->list, relation + * + * pi_state->refcount: + * + * pi_state lifetime + * + * + * Lock order: + * + * hb->lock + * pi_mutex->wait_lock + * p->pi_lock + * */ /* @@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr) * the pi_state against the user space value. If correct, attach to * it. */ -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, +static int attach_to_pi_state(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, struct futex_pi_state **ps) { pid_t pid = uval & FUTEX_TID_MASK; + u32 uval2; + int ret; /* * Userspace might have messed up non-PI and PI futexes [3] @@ -991,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, if (unlikely(!pi_state)) return -EINVAL; + /* + * We get here with hb->lock held, and having found a + * futex_top_waiter(). This means that futex_lock_pi() of said futex_q + * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), + * which in turn means that futex_lock_pi() still has a reference on + * our pi_state. + * + * The waiter holding a reference on @pi_state also protects against + * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() + * and futex_wait_requeue_pi() as it cannot go to 0 and consequently + * free pi_state before we can take a reference ourselves. + */ WARN_ON(!atomic_read(&pi_state->refcount)); /* + * Now that we have a pi_state, we can acquire wait_lock + * and do the state validation. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Since {uval, pi_state} is serialized by wait_lock, and our current + * uval was read without holding it, it can have changed. Verify it + * still is what we expect it to be, otherwise retry the entire + * operation. + */ + if (get_futex_value_locked(&uval2, uaddr)) + goto out_efault; + + if (uval != uval2) + goto out_eagain; + + /* * Handle the owner died case: */ if (uval & FUTEX_OWNER_DIED) { @@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * is not 0. Inconsistent state. [5] */ if (pid) - return -EINVAL; + goto out_einval; /* * Take a ref on the state and return success. [4] */ - goto out_state; + goto out_attach; } /* @@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * Take a ref on the state and return success. [6] */ if (!pid) - goto out_state; + goto out_attach; } else { /* * If the owner died bit is not set, then the pi_state * must have an owner. [7] */ if (!pi_state->owner) - return -EINVAL; + goto out_einval; } /* @@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * user space TID. [9/10] */ if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; -out_state: - atomic_inc(&pi_state->refcount); + goto out_einval; + +out_attach: + get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); *ps = pi_state; return 0; + +out_einval: + ret = -EINVAL; + goto out_error; + +out_eagain: + ret = -EAGAIN; + goto out_error; + +out_efault: + ret = -EFAULT; + goto out_error; + +out_error: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; } /* @@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, /* * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. */ pi_state = alloc_pi_state(); @@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, return 0; } -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, +static int lookup_pi_state(u32 __user *uaddr, u32 uval, + struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps) { - struct futex_q *match = futex_top_waiter(hb, key); + struct futex_q *top_waiter = futex_top_waiter(hb, key); /* * If there is a waiter on that futex, validate it and * attach to the pi_state when the validation succeeds. */ - if (match) - return attach_to_pi_state(uval, match->pi_state, ps); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); /* * We are the first waiter - try to look up the owner based on @@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; - /*If user space value changed, let the caller retry */ + /* If user space value changed, let the caller retry */ return curval != uval ? -EAGAIN : 0; } @@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { u32 uval, newval, vpid = task_pid_vnr(task); - struct futex_q *match; + struct futex_q *top_waiter; int ret; /* @@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * Lookup existing state first. If it exists, try to attach to * its pi_state. */ - match = futex_top_waiter(hb, key); - if (match) - return attach_to_pi_state(uval, match->pi_state, ps); + top_waiter = futex_top_waiter(hb, key); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); /* * No waiter and user TID is 0. We are here because the @@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) wake_q_add(wake_q, p); __unqueue_futex(q); /* - * The waiting task can free the futex_q as soon as - * q->lock_ptr = NULL is written, without taking any locks. A - * memory barrier is required here to prevent the following - * store to lock_ptr from getting ahead of the plist_del. + * The waiting task can free the futex_q as soon as q->lock_ptr = NULL + * is written, without taking any locks. This is possible in the event + * of a spurious wakeup, for example. A memory barrier is required here + * to prevent the following store to lock_ptr from getting ahead of the + * plist_del in __unqueue_futex(). */ - smp_wmb(); - q->lock_ptr = NULL; + smp_store_release(&q->lock_ptr, NULL); } -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, - struct futex_hash_bucket *hb) +/* + * Caller must hold a reference on @pi_state. + */ +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - struct task_struct *new_owner; - struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + struct task_struct *new_owner; + bool postunlock = false; DEFINE_WAKE_Q(wake_q); - bool deboost; int ret = 0; - if (!pi_state) - return -EINVAL; - - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - return -EINVAL; - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + if (WARN_ON_ONCE(!new_owner)) { + /* + * As per the comment in futex_unlock_pi() this should not happen. + * + * When this happens, give up our locks and try again, giving + * the futex_lock_pi() instance time to complete, either by + * waiting on the rtmutex or removing itself from the futex + * queue. + */ + ret = -EAGAIN; + goto out_unlock; + } /* - * It is possible that the next waiter (the one that brought - * this owner to the kernel) timed out and is no longer - * waiting on the lock. - */ - if (!new_owner) - new_owner = this->task; - - /* - * We pass it to the next owner. The WAITERS bit is always - * kept enabled while there is PI state around. We cleanup the - * owner died bit, because we are the owner. + * We pass it to the next owner. The WAITERS bit is always kept + * enabled while there is PI state around. We cleanup the owner + * died bit, because we are the owner. */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); @@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; + } else if (curval != uval) { /* * If a unconditional UNLOCK_PI operation (user space did not @@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else ret = -EINVAL; } - if (ret) { - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return ret; - } + + if (ret) + goto out_unlock; + + /* + * This is a point of no return; once we modify the uval there is no + * going back and subsequent operations must not fail. + */ raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, pi_state->owner = new_owner; raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); - deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - /* - * First unlock HB so the waiter does not spin on it once he got woken - * up. Second wake up the waiter before the priority is adjusted. If we - * deboost first (and lose our higher priority), then the task might get - * scheduled away before the wake up can take place. - */ - spin_unlock(&hb->lock); - wake_up_q(&wake_q); - if (deboost) - rt_mutex_adjust_prio(current); + if (postunlock) + rt_mutex_postunlock(&wake_q); - return 0; + return ret; } /* @@ -1826,7 +1913,7 @@ retry_private: * If that call succeeds then we have pi_state and an * initial refcount on it. */ - ret = lookup_pi_state(ret, hb2, &key2, &pi_state); + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); } switch (ret) { @@ -1909,7 +1996,7 @@ retry_private: * refcount on the pi_state and store the pointer in * the futex_q object of the waiter. */ - atomic_inc(&pi_state->refcount); + get_pi_state(pi_state); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, @@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb) hb_waiters_dec(hb); } -/** - * queue_me() - Enqueue the futex_q on the futex_hash_bucket - * @q: The futex_q to enqueue - * @hb: The destination hash bucket - * - * The hb->lock must be held by the caller, and is released here. A call to - * queue_me() is typically paired with exactly one call to unqueue_me(). The - * exceptions involve the PI related operations, which may use unqueue_me_pi() - * or nothing if the unqueue is done as part of the wake process and the unqueue - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). - */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + __queue_me(q, hb); spin_unlock(&hb->lock); } @@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; - struct task_struct *oldowner = pi_state->owner; u32 uval, uninitialized_var(curval), newval; + struct task_struct *oldowner; int ret; + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + oldowner = pi_state->owner; /* Owner died? */ if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; @@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the * previous highest priority waiter or we are the highest priority - * waiter but failed to get the rtmutex the first time. + * waiter but have failed to get the rtmutex the first time. + * * We have to replace the newowner TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * @@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * because we can fault here. Imagine swapped out pages or a fork * that marked all the anonymous memory readonly for cow. * - * Modifying pi_state _before_ the user space value would - * leave the pi_state in an inconsistent state when we fault - * here, because we need to drop the hash bucket lock to - * handle the fault. This might be observed in the PID check - * in lookup_pi_state. + * Modifying pi_state _before_ the user space value would leave the + * pi_state in an inconsistent state when we fault here, because we + * need to drop the locks to handle the fault. This might be observed + * in the PID check in lookup_pi_state. */ retry: if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; - while (1) { + for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) @@ -2169,47 +2264,60 @@ retry: * itself. */ if (pi_state->owner != NULL) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); } pi_state->owner = newowner; - raw_spin_lock_irq(&newowner->pi_lock); + raw_spin_lock(&newowner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock_irq(&newowner->pi_lock); + raw_spin_unlock(&newowner->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return 0; /* - * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the highest priority - * waiter itself or the task which stole the rtmutex) the - * chance to try the fixup of the pi_state. So once we are - * back from handling the fault we need to check the pi_state - * after reacquiring the hash bucket lock and before trying to - * do another fixup. When the fixup has been done already we - * simply return. + * To handle the page fault we need to drop the locks here. That gives + * the other task (either the highest priority waiter itself or the + * task which stole the rtmutex) the chance to try the fixup of the + * pi_state. So once we are back from handling the fault we need to + * check the pi_state after reacquiring the locks and before trying to + * do another fixup. When the fixup has been done already we simply + * return. + * + * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely + * drop hb->lock since the caller owns the hb -> futex_q relation. + * Dropping the pi_mutex->wait_lock requires the state revalidate. */ handle_fault: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); ret = fault_in_user_writeable(uaddr); spin_lock(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* * Check if someone else fixed it for us: */ - if (pi_state->owner != oldowner) - return 0; + if (pi_state->owner != oldowner) { + ret = 0; + goto out_unlock; + } if (ret) - return ret; + goto out_unlock; goto retry; + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; } static long futex_wait_restart(struct restart_block *restart); @@ -2231,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart); */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { - struct task_struct *owner; int ret = 0; if (locked) { /* * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: + * + * We can safely read pi_state->owner without holding wait_lock + * because we now own the rt_mutex, only the owner will attempt + * to change it. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); @@ -2245,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) } /* - * Catch the rare case, where the lock was released when we were on the - * way back before we locked the hash bucket. - */ - if (q->pi_state->owner == current) { - /* - * Try to get the rt_mutex now. This might fail as some other - * task acquired the rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { - locked = 1; - goto out; - } - - /* - * pi_state is incorrect, some other task did a lock steal and - * we returned due to timeout or signal without taking the - * rt_mutex. Too late. - */ - raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); - owner = rt_mutex_owner(&q->pi_state->pi_mutex); - if (!owner) - owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); - ret = fixup_pi_state_owner(uaddr, q, owner); - goto out; - } - - /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " "pi-state %p\n", ret, q->pi_state->pi_mutex.owner, q->pi_state->owner); + } out: return ret ? ret : locked; @@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; int res, ret; @@ -2557,25 +2642,68 @@ retry_private: } } + WARN_ON(!q.pi_state); + /* * Only actually queue now that the atomic ops are done: */ - queue_me(&q, hb); + __queue_me(&q, hb); - WARN_ON(!q.pi_state); - /* - * Block on the PI mutex: - */ - if (!trylock) { - ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); - } else { - ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); /* Fixup the trylock return value: */ ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; } + rt_mutex_init_waiter(&rt_waiter); + + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling rt_mutex_start_proxy_lock(). This still fully + * serializes against futex_unlock_pi() as that does the exact same + * lock handoff sequence. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + + if (ret) { + if (ret == 1) + ret = 0; + + spin_lock(q.lock_ptr); + goto no_block; + } + + + if (unlikely(to)) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + spin_lock(q.lock_ptr); /* + * If we failed to acquire the lock (signal/timeout), we must + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex + * wait lists consistent. + * + * In particular; it is important that futex_unlock_pi() can not + * observe this inconsistency. + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; + +no_block: + /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ @@ -2591,12 +2719,19 @@ retry_private: * If fixup_owner() faulted and was unable to handle the fault, unlock * it and return the fault to userspace. */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) - rt_mutex_unlock(&q.pi_state->pi_mutex); + if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* Unqueue and drop the lock */ unqueue_me_pi(&q); + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + goto out_put_key; out_unlock_put_key: @@ -2605,8 +2740,10 @@ out_unlock_put_key: out_put_key: put_futex_key(&q.key); out: - if (to) + if (to) { + hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); + } return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: @@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; struct futex_hash_bucket *hb; - struct futex_q *match; + struct futex_q *top_waiter; int ret; retry: @@ -2657,12 +2794,37 @@ retry: * all and we at least want to know if user space fiddled * with the futex value instead of blindly unlocking. */ - match = futex_top_waiter(hb, &key); - if (match) { - ret = wake_futex_pi(uaddr, uval, match, hb); + top_waiter = futex_top_waiter(hb, &key); + if (top_waiter) { + struct futex_pi_state *pi_state = top_waiter->pi_state; + + ret = -EINVAL; + if (!pi_state) + goto out_unlock; + /* - * In case of success wake_futex_pi dropped the hash - * bucket lock. + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + goto out_unlock; + + get_pi_state(pi_state); + /* + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we + * observed. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); + + /* + * Success, we're done! No tricky corner cases. */ if (!ret) goto out_putkey; @@ -2677,7 +2839,6 @@ retry: * setting the FUTEX_WAITERS bit. Try again. */ if (ret == -EAGAIN) { - spin_unlock(&hb->lock); put_futex_key(&key); goto retry; } @@ -2685,7 +2846,7 @@ retry: * wake_futex_pi has detected invalid state. Tell user * space. */ - goto out_unlock; + goto out_putkey; } /* @@ -2695,8 +2856,10 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + spin_unlock(&hb->lock); goto pi_faulted; + } /* * If uval has changed, let user space handle it. @@ -2710,7 +2873,6 @@ out_putkey: return ret; pi_faulted: - spin_unlock(&hb->lock); put_futex_key(&key); ret = fault_in_user_writeable(uaddr); @@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; @@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - debug_rt_mutex_init_waiter(&rt_waiter); - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); - RB_CLEAR_NODE(&rt_waiter.tree_entry); - rt_waiter.task = NULL; + rt_mutex_init_waiter(&rt_waiter); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) - rt_mutex_unlock(&q.pi_state->pi_mutex); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. @@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); - debug_rt_mutex_free_waiter(&rt_waiter); + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); spin_lock(q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + + debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. @@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * the fault, unlock the rt_mutex and return the fault to * userspace. */ - if (ret && rt_mutex_owner(pi_mutex) == current) - rt_mutex_unlock(pi_mutex); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4544b115f5eb..e2d356dd7581 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; + int n, nodes, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; @@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) goto done; } - /* Spread the vectors per node */ - vecs_per_node = affv / nodes; - /* Account for rounding errors */ - extra_vecs = affv - (nodes * vecs_per_node); - for_each_node_mask(n, nodemsk) { - int ncpus, v, vecs_to_assign = vecs_per_node; + int ncpus, v, vecs_to_assign, vecs_per_node; + + /* Spread the vectors per node */ + vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); + vecs_to_assign = min(vecs_per_node, ncpus); + + /* Account for rounding errors */ + extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); for (v = 0; curvec < last_affv && v < vecs_to_assign; curvec++, v++) { @@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Account for extra vectors to compensate rounding errors */ if (extra_vecs) { cpus_per_vec++; - if (!--extra_vecs) - vecs_per_node++; + --extra_vecs; } irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } if (curvec >= last_affv) break; + --nodes; } done: diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index be3c34e4f2ac..686be4b73018 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq) irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); - action_ret = action->thread_fn(action->irq, action->dev_id); + action_ret = IRQ_NONE; + for_each_action_of_desc(desc, action) + action_ret |= action->thread_fn(action->irq, action->dev_id); + if (!noirqdebug) note_interrupt(desc, action_ret); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4afe5cc5af1..ae1c90f20381 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -852,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ - if (desc->irq_common_data.affinity) + if (cpumask_available(desc->irq_common_data.affinity)) cpumask_copy(mask, desc->irq_common_data.affinity); else valid = false; @@ -1212,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * set the trigger type must match. Also all must * agree on ONESHOT. */ + unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data); + if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + (oldtype != (new->flags & IRQF_TRIGGER_MASK)) || ((old->flags ^ new->flags) & IRQF_ONESHOT)) goto mismatch; diff --git a/kernel/kthread.c b/kernel/kthread.c index 2f26adea0f84..26db528c1d88 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/cgroup.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -225,6 +226,7 @@ static int kthread(void *_create) ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { + cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } @@ -538,6 +540,7 @@ int kthreadd(void *unused) set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; + cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a95e5d1f4a9c..98dd6231d43b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -660,6 +660,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + bool is_static = false; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); @@ -673,10 +674,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) /* * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: + * is the lock object itself. If the lock is in the per cpu area, + * the canonical address of the lock (per cpu offset removed) is + * used. */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; + if (unlikely(!lock->key)) { + unsigned long can_addr, addr = (unsigned long)lock; + + if (__is_kernel_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (__is_module_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (static_obj(lock)) + lock->key = (void *)lock; + else + return ERR_PTR(-EINVAL); + is_static = true; + } /* * NOTE: the class-key must be unique. For dynamic locks, a static @@ -708,7 +722,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } } - return NULL; + return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } /* @@ -726,19 +740,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); - if (likely(class)) + if (likely(!IS_ERR_OR_NULL(class))) goto out_set_class_cache; /* * Debug-check: all keys must be persistent! - */ - if (!static_obj(lock->key)) { + */ + if (IS_ERR(class)) { debug_locks_off(); printk("INFO: trying to register non-static key.\n"); printk("the code is fine but needs lockdep annotation.\n"); printk("turning off the locking correctness validator.\n"); dump_stack(); - return NULL; } @@ -3419,7 +3432,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. */ - if (!class) + if (IS_ERR_OR_NULL(class)) return 0; /* @@ -3437,13 +3450,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 0; } +/* @depth must not be zero */ +static struct held_lock *find_held_lock(struct task_struct *curr, + struct lockdep_map *lock, + unsigned int depth, int *idx) +{ + struct held_lock *ret, *hlock, *prev_hlock; + int i; + + i = depth - 1; + hlock = curr->held_locks + i; + ret = hlock; + if (match_held_lock(hlock, lock)) + goto out; + + ret = NULL; + for (i--, prev_hlock = hlock--; + i >= 0; + i--, prev_hlock = hlock--) { + /* + * We must not cross into another context: + */ + if (prev_hlock->irq_context != hlock->irq_context) { + ret = NULL; + break; + } + if (match_held_lock(hlock, lock)) { + ret = hlock; + break; + } + } + +out: + *idx = i; + return ret; +} + +static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, + int idx) +{ + struct held_lock *hlock; + + for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { + if (!__lock_acquire(hlock->instance, + hlock_class(hlock)->subclass, + hlock->trylock, + hlock->read, hlock->check, + hlock->hardirqs_off, + hlock->nest_lock, hlock->acquire_ip, + hlock->references, hlock->pin_count)) + return 1; + } + return 0; +} + static int __lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class *class; unsigned int depth; int i; @@ -3456,21 +3523,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); -found_it: lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes + 1; @@ -3478,15 +3534,46 @@ found_it: curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - for (; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) - return 0; - } + if (reacquire_held_locks(curr, depth, i)) + return 0; + + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + +static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! + */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + WARN(hlock->read, "downgrading a read lock"); + hlock->read = 1; + hlock->acquire_ip = ip; + + if (reacquire_held_locks(curr, depth, i)) + return 0; /* * I took it apart and put it back together again, except now I have @@ -3508,7 +3595,7 @@ static int __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; unsigned int depth; int i; @@ -3527,21 +3614,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * Check whether the lock exists in the current stack * of held locks: */ - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); -found_it: if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -3568,15 +3644,8 @@ found_it: curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - for (i++; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) - return 0; - } + if (reacquire_held_locks(curr, depth, i + 1)) + return 0; /* * We had N bottles of beer on the wall, we drank one, but now @@ -3741,6 +3810,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); +void lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_downgrade(lock, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_downgrade); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -3903,7 +3989,7 @@ static void __lock_contended(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class_stats *stats; unsigned int depth; int i, contention_point, contending_point; @@ -3916,22 +4002,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) { + print_lock_contention_bug(curr, lock, ip); + return; } - print_lock_contention_bug(curr, lock, ip); - return; -found_it: if (hlock->instance != lock) return; @@ -3955,7 +4031,7 @@ static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class_stats *stats; unsigned int depth; u64 now, waittime = 0; @@ -3969,22 +4045,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) { + print_lock_contention_bug(curr, lock, _RET_IP_); + return; } - print_lock_contention_bug(curr, lock, _RET_IP_); - return; -found_it: if (hlock->instance != lock) return; @@ -4172,7 +4238,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (class) + if (!IS_ERR_OR_NULL(class)) zap_class(class); } /* diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c2b88490d857..c08fbd2f5ba9 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -46,13 +46,13 @@ enum { (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) /* - * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, * .data and .bss to fit in required 32MB limit for the kernel. With - * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems. * So, reduce the static allocations for lockdeps related structures so that * everything fits in current required size limit. */ -#ifdef CONFIG_PROVE_LOCKING_SMALL +#ifdef CONFIG_LOCKDEP_SMALL /* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 97ee9df32e0f..32fe775a2eaf 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -174,12 +174,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) lock->name = name; } -void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) -{ -} - -void rt_mutex_deadlock_account_unlock(struct task_struct *task) -{ -} - diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index d0519c3432b6..b585af9a1b50 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -9,9 +9,6 @@ * This file contains macros used solely by rtmutex.c. Debug version. */ -extern void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6edc32ecd9c5..b95509416909 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #endif +/* + * Only use with rt_mutex_waiter_{less,equal}() + */ +#define task_to_waiter(p) \ + &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + static inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, struct rt_mutex_waiter *right) @@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, * then right waiter has a dl_prio() too. */ if (dl_prio(left->prio)) - return dl_time_before(left->task->dl.deadline, - right->task->dl.deadline); + return dl_time_before(left->deadline, right->deadline); return 0; } +static inline int +rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) +{ + if (left->prio != right->prio) + return 0; + + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. + * If left waiter has a dl_prio(), and we didn't return 0 above, + * then right waiter has a dl_prio() too. + */ + if (dl_prio(left->prio)) + return left->deadline == right->deadline; + + return 1; +} + static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { @@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->pi_tree_entry); } -/* - * Calculate task priority from the waiter tree priority - * - * Return task->normal_prio when the waiter tree is empty or when - * the waiter is not allowed to do priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->prio, - task->normal_prio); -} - -struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +static void rt_mutex_adjust_prio(struct task_struct *p) { - if (likely(!task_has_pi_waiters(task))) - return NULL; - - return task_top_pi_waiter(task)->task; -} + struct task_struct *pi_task = NULL; -/* - * Called by sched_setscheduler() to get the priority which will be - * effective after the change. - */ -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) -{ - if (!task_has_pi_waiters(task)) - return newprio; + lockdep_assert_held(&p->pi_lock); - if (task_top_pi_waiter(task)->task->prio <= newprio) - return task_top_pi_waiter(task)->task->prio; - return newprio; -} + if (task_has_pi_waiters(p)) + pi_task = task_top_pi_waiter(p)->task; -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. - */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio || dl_prio(prio)) - rt_mutex_setprio(task, prio); -} - -/* - * Adjust task priority (undo boosting). Called from the exit path of - * rt_mutex_slowunlock() and rt_mutex_slowlock(). - * - * (Note: We do this outside of the protection of lock->wait_lock to - * allow the lock to be taken while or before we readjust the priority - * of task. We do not use the spin_xx_mutex() variants here as we are - * outside of the debug path.) - */ -void rt_mutex_adjust_prio(struct task_struct *task) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + rt_mutex_setprio(p, pi_task); } /* @@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (waiter->prio == task->prio) { + if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* [7] Requeue the waiter in the lock waiter tree. */ rt_mutex_dequeue(lock, waiter); + + /* + * Update the waiter prio fields now that we're dequeued. + * + * These values can have changed through either: + * + * sys_sched_set_scheduler() / sys_sched_setattr() + * + * or + * + * DL CBS enforcement advancing the effective deadline. + * + * Even though pi_waiters also uses these fields, and that tree is only + * updated in [11], we can do this here, since we hold [L], which + * serializes all pi_waiters access and rb_erase() does not care about + * the values of the node being removed. + */ waiter->prio = task->prio; + waiter->deadline = task->dl.deadline; + rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ @@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else if (prerequeue_top_waiter == waiter) { /* @@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else { /* * Nothing changed. No need to do any priority @@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { + lockdep_assert_held(&lock->wait_lock); + /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * the top waiter priority (kernel view), * @task lost. */ - if (task->prio >= rt_mutex_top_waiter(lock)->prio) + if (!rt_mutex_waiter_less(task_to_waiter(task), + rt_mutex_top_waiter(lock))) return 0; /* @@ -938,8 +928,6 @@ takeit: */ rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, task); - return 1; } @@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex *next_lock; int chain_walk = 0, res; + lockdep_assert_held(&lock->wait_lock); + /* * Early deadlock detection. We really don't want the task to * enqueue on itself just to untangle the mess later. It's not @@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; + waiter->deadline = task->dl.deadline; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, waiter = rt_mutex_top_waiter(lock); /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. + * Remove it from current->pi_waiters and deboost. + * + * We must in fact deboost here in order to ensure we call + * rt_mutex_setprio() to update p->pi_top_task before the + * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); + rt_mutex_adjust_prio(current); /* * As we are waking up the top waiter, and the waiter stays @@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock(¤t->pi_lock); - + /* + * We deboosted before waking the top waiter task such that we don't + * run two tasks with the 'same' priority (and ensure the + * p->pi_top_task pointer points to a blocked task). This however can + * lead to priority inversion if we would get preempted after the + * deboost but before waking our donor task, hence the preempt_disable() + * before unlock. + * + * Pairs with preempt_enable() in rt_mutex_postunlock(); + */ + preempt_disable(); wake_q_add(wake_q, waiter->task); + raw_spin_unlock(¤t->pi_lock); } /* @@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock, struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; @@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); @@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || (waiter->prio == task->prio && - !dl_prio(task->prio))) { + if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task) next_lock, NULL, task); } +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->task = NULL; +} + /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take @@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, unsigned long flags; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - RB_CLEAR_NODE(&waiter.pi_tree_entry); - RB_CLEAR_NODE(&waiter.tree_entry); + rt_mutex_init_waiter(&waiter); /* * Technically we could use raw_spin_[un]lock_irq() here, but this can @@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path to release a rt-mutex. - * Return whether the current task needs to undo a potential priority boosting. + * + * Return whether the current task needs to call rt_mutex_postunlock(). */ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) @@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, debug_rt_mutex_unlock(lock); - rt_mutex_deadlock_account_unlock(current); - /* * We must be careful here if the fast path is enabled. If we * have no waiters queued we cannot set owner to NULL here @@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * Queue the next waiter for wakeup once we release the wait_lock. */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - /* check PI boosting */ - return true; + return true; /* call rt_mutex_postunlock() */ } /* @@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, enum rtmutex_chainwalk chwalk)) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - } else - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } static inline int @@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk)) { if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - } else - return slowfn(lock, state, timeout, chwalk); + + return slowfn(lock, state, timeout, chwalk); } static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, int (*slowfn)(struct rt_mutex *lock)) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 1; - } + return slowfn(lock); } +/* + * Performs the wakeup of the the top-waiter and re-enables preemption. + */ +void rt_mutex_postunlock(struct wake_q_head *wake_q) +{ + wake_up_q(wake_q); + + /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ + preempt_enable(); +} + static inline void rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, @@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock, { DEFINE_WAKE_Q(wake_q); - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); - - } else { - bool deboost = slowfn(lock, &wake_q); - - wake_up_q(&wake_q); + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; - /* Undo pi boosting if necessary: */ - if (deboost) - rt_mutex_adjust_prio(current); - } + if (slowfn(lock, &wake_q)) + rt_mutex_postunlock(&wake_q); } /** @@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /* - * Futex variant with full deadlock detection. + * Futex variant, must not use fastpath. */ -int rt_mutex_timed_futex_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *timeout) +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) { - might_sleep(); - - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_FULL_CHAINWALK, - rt_mutex_slowlock); + return rt_mutex_slowtrylock(lock); } /** @@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** - * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock - * @lock: the rt_mutex to be unlocked - * - * Returns: true/false indicating whether priority adjustment is - * required or not. + * Futex variant, that since futex variants do not use the fast-path, can be + * simple and will not need to retry. */ -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh) +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wake_q) { - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); - return false; + lockdep_assert_held(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + return false; /* done */ } - return rt_mutex_slowunlock(lock, wqh); + + /* + * We've already deboosted, mark_wakeup_next_waiter() will + * retain preempt_disabled when we drop the wait_lock, to + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). + */ + mark_wakeup_next_waiter(wake_q, lock); + + return true; /* call postunlock() */ +} + +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +{ + DEFINE_WAKE_Q(wake_q); + bool postunlock; + + raw_spin_lock_irq(&lock->wait_lock); + postunlock = __rt_mutex_futex_unlock(lock, &wake_q); + raw_spin_unlock_irq(&lock->wait_lock); + + if (postunlock) + rt_mutex_postunlock(&wake_q); } /** @@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); - rt_mutex_deadlock_account_lock(lock, proxy_owner); } /** @@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); - rt_mutex_deadlock_account_unlock(proxy_owner); } -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for FUTEX_REQUEUE_PI support. - */ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { int ret; - raw_spin_lock_irq(&lock->wait_lock); - - if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock_irq(&lock->wait_lock); + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; - } /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, @@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock_irq(&lock->wait_lock); - debug_rt_mutex_print_deadlock(waiter); return ret; } /** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** * rt_mutex_next_owner - return the next owner of the lock * * @lock: the rt lock query @@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) } /** - * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition * @lock: the rt_mutex we were woken on * @to: the timeout, null if none. hrtimer should already have * been started. * @waiter: the pre-initialized rt_mutex_waiter * - * Complete the lock acquisition started our behalf by another thread. + * Wait for the the lock acquisition started on our behalf by + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call + * rt_mutex_cleanup_proxy_lock(). * * Returns: * 0 - success * <0 - error, one of -EINTR, -ETIMEDOUT * - * Special API call for PI-futex requeue support + * Special API call for PI-futex support */ -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, +int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter) { @@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - if (unlikely(ret)) + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition + * @lock: the rt_mutex we were woken on + * @waiter: the pre-initialized rt_mutex_waiter + * + * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). + * + * Unless we acquired the lock; we're still enqueued on the wait-list and can + * in fact still be granted ownership until we're removed. Therefore we can + * find we are in fact the owner and must disregard the + * rt_mutex_wait_proxy_lock() failure. + * + * Returns: + * true - did the cleanup, we done. + * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, + * caller should disregards its return value. + * + * Special API call for PI-futex support + */ +bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) +{ + bool cleanup = false; + + raw_spin_lock_irq(&lock->wait_lock); + /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { remove_waiter(lock, waiter); + fixup_rt_mutex_waiters(lock); + cleanup = true; + } /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might @@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock_irq(&lock->wait_lock); - return ret; + return cleanup; } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index c4060584c407..6607802efa8b 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -11,8 +11,6 @@ */ #define rt_mutex_deadlock_check(l) (0) -#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) -#define rt_mutex_deadlock_account_unlock(l) do { } while (0) #define debug_rt_mutex_init_waiter(w) do { } while (0) #define debug_rt_mutex_free_waiter(w) do { } while (0) #define debug_rt_mutex_lock(l) do { } while (0) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 856dfff5c33a..72ad45a9a794 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -34,6 +34,7 @@ struct rt_mutex_waiter { struct rt_mutex *deadlock_lock; #endif int prio; + u64 deadline; }; /* @@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter); -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); -extern void rt_mutex_adjust_prio(struct task_struct *task); +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter); +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); + +extern int rt_mutex_futex_trylock(struct rt_mutex *l); + +extern void rt_mutex_futex_unlock(struct rt_mutex *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh); + +extern void rt_mutex_postunlock(struct wake_q_head *wake_q); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 90a74ccd85a4..4d48b1c4870d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write); */ void downgrade_write(struct rw_semaphore *sem) { - /* - * lockdep: a downgraded write will live on as a write - * dependency. - */ + lock_downgrade(&sem->dep_map, _RET_IP_); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 6b7abb334ca6..39f56c870051 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus) struct stress { struct work_struct work; struct ww_mutex *locks; + unsigned long timeout; int nlocks; - int nloops; }; static int *get_random_order(int count) @@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work) if (!order) return; - ww_acquire_init(&ctx, &ww_class); - do { int contended = -1; int n, err; + ww_acquire_init(&ctx, &ww_class); retry: err = 0; for (n = 0; n < nlocks; n++) { @@ -433,9 +432,9 @@ retry: __func__, err); break; } - } while (--stress->nloops); - ww_acquire_fini(&ctx); + ww_acquire_fini(&ctx); + } while (!time_after(jiffies, stress->timeout)); kfree(order); kfree(stress); @@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work) kfree(order); order = NULL; - ww_acquire_init(&ctx, &ww_class); - do { + ww_acquire_init(&ctx, &ww_class); + list_for_each_entry(ll, &locks, link) { err = ww_mutex_lock(ll->lock, &ctx); if (!err) @@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work) dummy_load(stress); list_for_each_entry(ll, &locks, link) ww_mutex_unlock(ll->lock); - } while (--stress->nloops); - ww_acquire_fini(&ctx); + ww_acquire_fini(&ctx); + } while (!time_after(jiffies, stress->timeout)); out: list_for_each_entry_safe(ll, ln, &locks, link) @@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work) __func__, err); break; } - } while (--stress->nloops); + } while (!time_after(jiffies, stress->timeout)); kfree(stress); } @@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work) #define STRESS_ONE BIT(2) #define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) -static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) +static int stress(int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; int n; @@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) INIT_WORK(&stress->work, fn); stress->locks = locks; stress->nlocks = nlocks; - stress->nloops = nloops; + stress->timeout = jiffies + 2*HZ; queue_work(wq, &stress->work); nthreads--; @@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); + ret = stress(16, 2*ncpus, STRESS_INORDER); if (ret) return ret; - ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); + ret = stress(16, 2*ncpus, STRESS_REORDER); if (ret) return ret; - ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); + ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; diff --git a/kernel/module.c b/kernel/module.c index 7eba6dea4f41..6d9988031c5b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -665,16 +665,7 @@ static void percpu_modcopy(struct module *mod, memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); } -/** - * is_module_percpu_address - test whether address is from module static percpu - * @addr: address to test - * - * Test whether @addr belongs to module static percpu area. - * - * RETURNS: - * %true if @addr is from module static percpu area - */ -bool is_module_percpu_address(unsigned long addr) +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { struct module *mod; unsigned int cpu; @@ -688,9 +679,15 @@ bool is_module_percpu_address(unsigned long addr) continue; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(mod->percpu, cpu); - - if ((void *)addr >= start && - (void *)addr < start + mod->percpu_size) { + void *va = (void *)addr; + + if (va >= start && va < start + mod->percpu_size) { + if (can_addr) { + *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(mod->percpu, + get_boot_cpu_id()); + } preempt_enable(); return true; } @@ -701,6 +698,20 @@ bool is_module_percpu_address(unsigned long addr) return false; } +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + return __is_module_percpu_address(addr, NULL); +} + #else /* ... !CONFIG_SMP */ static inline void __percpu *mod_percpu(struct module *mod) @@ -732,6 +743,11 @@ bool is_module_percpu_address(unsigned long addr) return false; } +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b31fc05a0f1..c51147a1204c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000; cpumask_var_t cpu_isolated_map; /* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - raw_spin_lock(&rq->lock); - - return rq; -} - -/* * __task_rq_lock - lock the rq @p resides on. */ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq) return; #ifdef CONFIG_SCHED_DEBUG + if (sched_feat(WARN_DOUBLE_CLOCK)) + SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; #endif + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; if (delta < 0) return; @@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq) static enum hrtimer_restart hrtick(struct hrtimer *timer) { struct rq *rq = container_of(timer, struct rq, hrtick_timer); + struct rq_flags rf; WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); return HRTIMER_NORESTART; } @@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq) static void __hrtick_start(void *arg) { struct rq *rq = arg; + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); __hrtick_restart(rq); rq->hrtick_csd_pending = 0; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p) static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & ENQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & ENQUEUE_RESTORE)) sched_info_queued(rq, p); + p->sched_class->enqueue_task(rq, p, flags); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & DEQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(rq, p); + p->sched_class->dequeue_task(rq, p, flags); } @@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * * Returns (locked) new rq. Old rq's lock is released. */ -static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int new_cpu) { lockdep_assert_held(&rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = cpu_rq(new_cpu); - raw_spin_lock(&rq->lock); + rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); enqueue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; @@ -980,7 +977,8 @@ struct migration_arg { * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. */ -static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) +static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int dest_cpu) { if (unlikely(!cpu_active(dest_cpu))) return rq; @@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return rq; - rq = move_queued_task(rq, p, dest_cpu); + update_rq_clock(rq); + rq = move_queued_task(rq, rf, p, dest_cpu); return rq; } @@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + struct rq_flags rf; /* * The original target CPU might have gone down and we might @@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data) sched_ttwu_pending(); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data) */ if (task_rq(p) == rq) { if (task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else p->wake_cpu = arg->dest_cpu; } - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); raw_spin_unlock(&p->pi_lock); local_irq_enable(); @@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * holding rq->lock. */ lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) put_prev_task(rq, p); @@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); } @@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - rq_unpin_lock(rq, &rf); - rq = move_queued_task(rq, p, dest_cpu); - rq_repin_lock(rq, &rf); + rq = move_queued_task(rq, &rf, p, dest_cpu); } out: task_rq_unlock(rq, p, &rf); @@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; + struct rq_flags srf, drf; src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + rq_pin_lock(src_rq, &srf); + rq_pin_lock(dst_rq, &drf); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); + + rq_unpin_lock(dst_rq, &drf); + rq_unpin_lock(src_rq, &srf); + } else { /* * Task isn't running anymore; make it appear like we migrated @@ -1680,7 +1686,7 @@ static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { - int en_flags = ENQUEUE_WAKEUP; + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; lockdep_assert_held(&rq->lock); @@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void) struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; - unsigned long flags; struct rq_flags rf; if (!llist) return; - raw_spin_lock_irqsave(&rq->lock, flags); - rq_pin_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); while (llist) { int wake_flags = 0; @@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void) ttwu_do_activate(rq, p, wake_flags, &rf); } - rq_unpin_lock(rq, &rf); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } void scheduler_ipi(void) @@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; rcu_read_lock(); @@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu) if (set_nr_if_polling(rq->idle)) { trace_sched_wake_idle_without_ipi(cpu); } else { - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (is_idle_task(rq->curr)) smp_send_reschedule(cpu); /* Else CPU is not idle, do nothing here: */ - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } out: @@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } #endif - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, &rf); + rq_lock(rq, &rf); + update_rq_clock(rq); ttwu_do_activate(rq, p, wake_flags, &rf); - rq_unpin_lock(rq, &rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); + rq_relock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) delayacct_blkio_end(); atomic_dec(&rq->nr_iowait); } - ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); } ttwu_do_wakeup(rq, p, 0, rf); @@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(&p->se); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -3093,15 +3094,18 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + struct rq_flags rf; sched_clock_tick(); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); + update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); - raw_spin_unlock(&rq->lock); + + rq_unlock(rq, &rf); perf_event_task_tick(); @@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, &rf); + rq_lock(rq, &rf); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; + update_rq_clock(rq); switch_count = &prev->nivcsw; if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { - deactivate_task(rq, prev, DEQUEUE_SLEEP); + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); prev->on_rq = 0; if (prev->in_iowait) { @@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nvcsw; } - if (task_on_rq_queued(prev)) - update_rq_clock(rq); - next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt) rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unpin_lock(rq, &rf); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } balance_callback(rq); @@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function); #ifdef CONFIG_RT_MUTEXES +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + /* * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) + * @p: task to boost + * @pi_task: donor task * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). @@ -3682,17 +3697,42 @@ EXPORT_SYMBOL(default_wake_function); * Used by the rt_mutex code to implement priority inheritance * logic. Call site only calls if the priority of the task changed. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; + int prio, oldprio, queued, running, queue_flag = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; struct rq *rq; - BUG_ON(prio > MAX_PRIO); + /* XXX used to be waiter->prio, not waiter->task->prio */ + prio = __rt_effective_prio(pi_task, p->normal_prio); + + /* + * If nothing changed; bail early. + */ + if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) + return; rq = __task_rq_lock(p, &rf); update_rq_clock(rq); + /* + * Set under pi_lock && rq->lock, such that the value can be used under + * either lock. + * + * Note that there is loads of tricky to make this pointer cache work + * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to + * ensure a task is de-boosted (pi_task is set to NULL) before the + * task is allowed to run again (and can exit). This ensures the pointer + * points to a blocked task -- which guaratees the task is present. + */ + p->pi_top_task = pi_task; + + /* + * For FIFO/RR we only need to set prio, if that matches we're done. + */ + if (prio == p->prio && !dl_prio(prio)) + goto out_unlock; /* * Idle task boosting is a nono in general. There is one @@ -3712,7 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) goto out_unlock; } - trace_sched_pi_setprio(p, prio); + trace_sched_pi_setprio(p, pi_task); oldprio = p->prio; if (oldprio == prio) @@ -3736,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) * running task */ if (dl_prio(prio)) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; @@ -3774,6 +3813,11 @@ out_unlock: balance_callback(rq); preempt_enable(); } +#else +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} #endif void set_user_nice(struct task_struct *p, long nice) @@ -3805,7 +3849,7 @@ void set_user_nice(struct task_struct *p, long nice) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) put_prev_task(rq, p); @@ -3816,7 +3860,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4020,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * Keep a potential priority boosting if called from * sched_setscheduler(). */ + p->prio = normal_prio(p); if (keep_boost) - p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); - else - p->prio = normal_prio(p); + p->prio = rt_effective_prio(p, p->prio); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -4126,7 +4169,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq_flags rf; int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; /* May grab non-irq protected spin_locks: */ @@ -4310,7 +4353,7 @@ change: * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + new_effective_prio = rt_effective_prio(p, newprio); if (new_effective_prio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } @@ -4923,7 +4966,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, */ SYSCALL_DEFINE0(sched_yield) { - struct rq *rq = this_rq_lock(); + struct rq_flags rf; + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, &rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -4932,9 +4980,8 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&rq->lock); + preempt_disable(); + rq_unlock(rq, &rf); sched_preempt_enable_no_resched(); schedule(); @@ -5514,7 +5561,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); @@ -5579,11 +5626,11 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf; + struct rq_flags orf = *rf; int dest_cpu; /* @@ -5602,9 +5649,7 @@ static void migrate_tasks(struct rq *dead_rq) * class method both need to have an up-to-date * value of rq->clock[_task] */ - rq_pin_lock(rq, &rf); update_rq_clock(rq); - rq_unpin_lock(rq, &rf); for (;;) { /* @@ -5617,8 +5662,7 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task() assumes pinned rq->lock: */ - rq_repin_lock(rq, &rf); - next = pick_next_task(rq, &fake_task, &rf); + next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5631,10 +5675,9 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - rq_unpin_lock(rq, &rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); - raw_spin_lock(&rq->lock); + rq_relock(rq, rf); /* * Since we're inside stop-machine, _nothing_ should have @@ -5648,12 +5691,12 @@ static void migrate_tasks(struct rq *dead_rq) /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - - rq = __migrate_task(rq, next, dest_cpu); + rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = dead_rq; - raw_spin_lock(&rq->lock); + *rf = orf; + rq_relock(rq, rf); } raw_spin_unlock(&next->pi_lock); } @@ -5732,7 +5775,7 @@ static void cpuset_cpu_active(void) * cpuset configurations. */ } - cpuset_update_active_cpus(true); + cpuset_update_active_cpus(); } static int cpuset_cpu_inactive(unsigned int cpu) @@ -5755,7 +5798,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) if (overflow) return -EBUSY; - cpuset_update_active_cpus(false); + cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); @@ -5766,7 +5809,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; set_cpu_active(cpu, true); @@ -5784,12 +5827,12 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); update_max_interval(); @@ -5847,18 +5890,20 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); - raw_spin_lock_irqsave(&rq->lock, flags); + + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, &rf); BUG_ON(rq->nr_running != 1); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(cpu); @@ -6412,7 +6457,8 @@ static void sched_change_group(struct task_struct *tsk, int type) */ void sched_move_task(struct task_struct *tsk) { - int queued, running; + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq_flags rf; struct rq *rq; @@ -6423,14 +6469,14 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + dequeue_task(rq, tsk, queue_flags); if (running) put_prev_task(rq, tsk); sched_change_group(tsk, TASK_MOVE_GROUP); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); + enqueue_task(rq, tsk, queue_flags); if (running) set_curr_task(rq, tsk); @@ -7008,14 +7054,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; + struct rq_flags rf; - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 54c577578da6..76877a62b5fa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -61,6 +61,11 @@ struct sugov_cpu { unsigned long util; unsigned long max; unsigned int flags; + + /* The field below is for single-CPU policies only. */ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -93,22 +98,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, { struct cpufreq_policy *policy = sg_policy->policy; + if (sg_policy->next_freq == next_freq) + return; + + if (sg_policy->next_freq > next_freq) + next_freq = (sg_policy->next_freq + next_freq) >> 1; + + sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; if (policy->fast_switch_enabled) { - if (sg_policy->next_freq == next_freq) { - trace_cpu_frequency(policy->cur, smp_processor_id()); - return; - } - sg_policy->next_freq = next_freq; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else if (sg_policy->next_freq != next_freq) { - sg_policy->next_freq = next_freq; + } else { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -192,6 +198,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, sg_cpu->iowait_boost >>= 1; } +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -200,6 +219,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; + bool busy; sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -207,40 +227,37 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; + busy = sugov_cpu_is_busy(sg_cpu); + if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. + */ + if (busy && next_f < sg_policy->next_freq) + next_f = sg_policy->next_freq; } sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max, - unsigned int flags) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned long util = 0, max = 1; unsigned int j; - if (flags & SCHED_CPUFREQ_RT_DL) - return max_f; - - sugov_iowait_boost(sg_cpu, &util, &max); - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu; + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; s64 delta_ns; - if (j == smp_processor_id()) - continue; - - j_sg_cpu = &per_cpu(sugov_cpu, j); /* * If the CPU utilization was last updated before the previous * frequency update and the time elapsed between the last update @@ -254,7 +271,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) - return max_f; + return policy->cpuinfo.max_freq; j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; @@ -289,7 +306,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + if (flags & SCHED_CPUFREQ_RT_DL) + next_f = sg_policy->policy->cpuinfo.max_freq; + else + next_f = sugov_next_freq_shared(sg_cpu); + sugov_update_commit(sg_policy, time, next_f); } @@ -473,7 +494,6 @@ static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; struct sugov_tunables *tunables; - unsigned int lat; int ret = 0; /* State should be equivalent to EXIT */ @@ -512,10 +532,16 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; + if (policy->transition_delay_us) { + tunables->rate_limit_us = policy->transition_delay_us; + } else { + unsigned int lat; + + tunables->rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + } policy->governor_data = sg_policy; sg_policy->tunables = tunables; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f3778e2b46c8..aea3135c5d90 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } +static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, + enum cpu_usage_stat idx) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + u64_stats_update_begin(&irqtime->sync); + cpustat[idx] += delta; + irqtime->total += delta; + irqtime->tick_delta += delta; + u64_stats_update_end(&irqtime->sync); +} + /* * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. @@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void) void irqtime_account_irq(struct task_struct *curr) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); - u64 *cpustat = kcpustat_this_cpu->cpustat; s64 delta; int cpu; @@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr) delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; - u64_stats_update_begin(&irqtime->sync); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) { - cpustat[CPUTIME_IRQ] += delta; - irqtime->tick_delta += delta; - } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { - cpustat[CPUTIME_SOFTIRQ] += delta; - irqtime->tick_delta += delta; - } - - u64_stats_update_end(&irqtime->sync); + if (hardirq_count()) + irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dea138964b91..a903276fcb62 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP + +#include "sched-pelt.h" + static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are - * dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ - /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) { @@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ -static const u32 runnable_avg_yN_inv[] = { - 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, - 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, - 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, - 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, - 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, - 0x85aac367, 0x82cd8698, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent - * over-estimates when re-combining. - */ -static const u32 runnable_avg_yN_sum[] = { - 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, - 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, - 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to - * lower integers. See Documentation/scheduler/sched-avg.txt how these - * were generated: - */ -static const u32 __accumulated_sum_N32[] = { - 0, 23371, 35056, 40899, 43820, 45281, - 46011, 46376, 46559, 46650, 46696, 46719, -}; - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static __always_inline u64 decay_load(u64 val, u64 n) +static u64 decay_load(u64 val, u64 n) { unsigned int local_n; - if (!n) - return val; - else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + if (unlikely(n > LOAD_AVG_PERIOD * 63)) return 0; /* after bounds checking we can collapse to 32-bit */ @@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n) return val; } +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; /* y^0 == 1 */ + + /* + * c1 = d1 y^p + */ + c1 = decay_load((u64)d1, periods); + + /* + * p-1 + * c2 = 1024 \Sum y^n + * n=1 + * + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p + */ + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* - * For updates fully spanning n periods, the contribution to runnable - * average will be: \Sum 1024*y^n + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 * - * We can compute this reasonably efficiently by combining: - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + * = u y^p + (Step 1) + * + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 */ -static u32 __compute_runnable_contrib(u64 n) +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u32 contrib = 0; + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; - if (likely(n <= LOAD_AVG_PERIOD)) - return runnable_avg_yN_sum[n]; - else if (unlikely(n >= LOAD_AVG_MAX_N)) - return LOAD_AVG_MAX; + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; - n %= LOAD_AVG_PERIOD; - contrib = decay_load(contrib, n); - return contrib + runnable_avg_yN_sum[n]; -} + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} /* * We can represent the historical contribution to runnable average as the @@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -__update_load_avg(u64 now, int cpu, struct sched_avg *sa, +___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, delta >>= 10; if (!delta) return 0; - sa->last_update_time = now; - - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; + sa->last_update_time += delta << 10; - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. - */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; + return 1; +} - sa->period_contrib += delta; +static int +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); +} - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - } +static int +__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); +} - return decayed; +static int +__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + return ___update_load_avg(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq); } /* @@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { + u64 p_last_update_time; + u64 n_last_update_time; + if (!sched_feat(ATTACH_AGE_LOAD)) return; @@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se, * time. This will result in the wakee task is less decayed, but giving * the wakee more load sounds not bad. */ - if (se->avg.last_update_time && prev) { - u64 p_last_update_time; - u64 n_last_update_time; + if (!(se->avg.last_update_time && prev)) + return; #ifndef CONFIG_64BIT + { u64 p_last_update_time_copy; u64 n_last_update_time_copy; @@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se, } while (p_last_update_time != p_last_update_time_copy || n_last_update_time != n_last_update_time_copy); + } #else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), - &se->avg, 0, 0, NULL); - se->avg.last_update_time = n_last_update_time; - } + __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + se->avg.last_update_time = n_last_update_time; } /* Take into account change of utilization of a child task group */ @@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) set_tg_cfs_propagate(cfs_rq); } - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); + decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { - __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed |= propagate_entity_load_avg(se); @@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); } /* @@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); if (!remaining) break; @@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void) unsigned long curr_jiffies = READ_ONCE(jiffies); struct rq *this_rq = this_rq(); unsigned long load; + struct rq_flags rf; if (curr_jiffies == this_rq->last_load_update_tick) return; load = weighted_cpuload(cpu_of(this_rq)); - raw_spin_lock(&this_rq->lock); + rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); - raw_spin_unlock(&this_rq->lock); + rq_unlock(this_rq, &rf); } #else /* !CONFIG_NO_HZ_COMMON */ static inline void cpu_load_update_nohz(struct rq *this_rq, @@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - deactivate_task(env->src_rq, p, 0); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p) */ static void attach_one_task(struct rq *rq, struct task_struct *p) { - raw_spin_lock(&rq->lock); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); attach_task(rq, p); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env) { struct list_head *tasks = &env->tasks; struct task_struct *p; + struct rq_flags rf; - raw_spin_lock(&env->dst_rq->lock); + rq_lock(env->dst_rq, &rf); + update_rq_clock(env->dst_rq); while (!list_empty(tasks)) { p = list_first_entry(tasks, struct task_struct, se.group_node); @@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - raw_spin_unlock(&env->dst_rq->lock); + rq_unlock(env->dst_rq, &rf); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); /* @@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu) * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } /* @@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } static unsigned long task_h_load(struct task_struct *p) @@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false; @@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); if (local_group) { sds->local = sg; - sgs = &sds->local_stat; + sgs = local; if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) @@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * the tasks on the system). */ if (prefer_sibling && sds->local && - group_has_capacity(env, &sds->local_stat) && - (sgs->sum_nr_running > 1)) { + group_has_capacity(env, local) && + (sgs->sum_nr_running > local->sum_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } @@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; - unsigned long flags; + struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { @@ -8105,7 +8139,7 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - raw_spin_lock_irqsave(&busiest->lock, flags); + rq_lock_irqsave(busiest, &rf); update_rq_clock(busiest); /* @@ -8122,14 +8156,14 @@ more_balance: * See task_rq_lock() family for the details. */ - raw_spin_unlock(&busiest->lock); + rq_unlock(busiest, &rf); if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } - local_irq_restore(flags); + local_irq_restore(rf.flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -8207,6 +8241,8 @@ more_balance: sd->nr_balance_failed++; if (need_active_balance(&env)) { + unsigned long flags; + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; struct task_struct *p = NULL; + struct rq_flags rf; - raw_spin_lock_irq(&busiest_rq->lock); + rq_lock_irq(busiest_rq, &rf); /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock(&busiest_rq->lock); + rq_unlock(busiest_rq, &rf); if (p) attach_one_task(target_rq, p); @@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { - raw_spin_lock_irq(&rq->lock); + struct rq_flags rf; + + rq_lock_irq(rq, &rf); update_rq_clock(rq); cpu_load_update_idle(rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); + rebalance_domains(rq, CPU_IDLE); } @@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; struct rq *rq = this_rq(); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); @@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se; + struct sched_entity *se = tg->se[i]; + struct rq_flags rf; - se = tg->se[i]; /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - - /* Possible calls to update_curr() need rq clock */ + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(se, UPDATE_TG); update_cfs_shares(se); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } done: diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1b3c8189b286..11192e0cb122 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_AVG_CPU, false) +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. + */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + #ifdef HAVE_RT_PUSH_IPI /* * In order to avoid a thundering herd attack of CPUs that are diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9f3e40226dec..979b7341008a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq) #define RT_PUSH_IPI_EXECUTING 1 #define RT_PUSH_IPI_RESTART 2 +/* + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * On large CPU boxes, there's the case that several CPUs could schedule + * a lower priority task at the same time, in which case it will look for + * any overloaded CPUs that it could pull a task from. To do this, the runqueue + * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting + * for a single overloaded CPU's runqueue lock can produce a large latency. + * (This has actually been observed on large boxes running cyclictest). + * Instead of taking the runqueue lock of the overloaded CPU, each of the + * CPUs that scheduled a lower priority task simply sends an IPI to the + * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with + * lots of contention. The overloaded CPU will look to push its non-running + * RT task off, and if it does, it can then ignore the other IPIs coming + * in, and just pass those IPIs off to any other overloaded CPU. + * + * When a CPU schedules a lower priority task, it only sends an IPI to + * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, + * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with + * RT overloaded tasks, would cause 100 IPIs to go out at once. + * + * The overloaded RT CPU, when receiving an IPI, will try to push off its + * overloaded RT tasks and then send an IPI to the next CPU that has + * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks + * have completed. Just because a CPU may have pushed off its own overloaded + * RT task does not mean it should stop sending the IPI around to other + * overloaded CPUs. There may be another RT task waiting to run on one of + * those CPUs that are of higher priority than the one that was just + * pushed. + * + * An optimization that could possibly be made is to make a CPU array similar + * to the cpupri array mask of all running RT tasks, but for the overloaded + * case, then the IPI could be sent to only the CPU with the highest priority + * RT task waiting, and that CPU could send off further IPIs to the CPU with + * the next highest waiting task. Since the overloaded case is much less likely + * to happen, the complexity of this implementation may not be worth it. + * Instead, just send an IPI around to all overloaded CPUs. + * + * The rq->rt.push_flags holds the status of the IPI that is going around. + * A run queue can only send out a single IPI at a time. The possible flags + * for rq->rt.push_flags are: + * + * (None or zero): No IPI is going around for the current rq + * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around + * RT_PUSH_IPI_RESTART: The priority of the running task for the rq + * has changed, and the IPI should restart + * circulating the overloaded CPUs again. + * + * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated + * before sending to the next CPU. + * + * Instead of having all CPUs that schedule a lower priority task send + * an IPI to the same "first" CPU in the RT overload mask, they send it + * to the next overloaded CPU after their own CPU. This helps distribute + * the work when there's more than one overloaded CPU and multiple CPUs + * scheduling in lower priority tasks. + * + * When a rq schedules a lower priority task than what was currently + * running, the next CPU with overloaded RT tasks is examined first. + * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower + * priority task, it will send an IPI first to CPU 5, then CPU 5 will + * send to CPU 1 if it is still overloaded. CPU 1 will clear the + * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. + * + * The first CPU to notice IPI_RESTART is set, will clear that flag and then + * send an IPI to the next overloaded CPU after the rq->cpu and not the next + * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 + * schedules a lower priority task, and the IPI_RESTART gets set while the + * handling is being done on CPU 5, it will clear the flag and send it back to + * CPU 4 instead of CPU 1. + * + * Note, the above logic can be disabled by turning off the sched_feature + * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be + * taken by the CPU requesting a pull and the waiting RT task will be pulled + * by that CPU. This may be fine for machines with few CPUs. + */ static void tell_cpu_to_push(struct rq *rq) { int cpu; diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h new file mode 100644 index 000000000000..cd200d16529e --- /dev/null +++ b/kernel/sched/sched-pelt.h @@ -0,0 +1,13 @@ +/* Generated by Documentation/scheduler/sched-pelt; do not modify. */ + +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5cbf92214ad8..7808ab050599 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 #define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 -#define ENQUEUE_HEAD 0x08 -#define ENQUEUE_REPLENISH 0x10 +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 #ifdef CONFIG_SMP -#define ENQUEUE_MIGRATED 0x20 +#define ENQUEUE_MIGRATED 0x40 #else #define ENQUEUE_MIGRATED 0x00 #endif @@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { } struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); + struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT @@ -1869,6 +1928,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { + u64 total; u64 tick_delta; u64 irq_start_time; struct u64_stats_sync sync; @@ -1876,16 +1936,20 @@ struct irqtime { DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +/* + * Returns the irqtime minus the softirq time computed by ksoftirqd. + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * and never move forward. + */ static inline u64 irq_time_read(int cpu) { struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); - u64 *cpustat = kcpustat_cpu(cpu).cpustat; unsigned int seq; u64 total; do { seq = __u64_stats_fetch_begin(&irqtime->sync); - total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; + total = irqtime->total; } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); return total; diff --git a/kernel/softirq.c b/kernel/softirq.c index 744fa611cae0..4e09821f9d9e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -309,7 +309,7 @@ restart: account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); - tsk_restore_flags(current, old_flags, PF_MEMALLOC); + current_restore_flags(old_flags, PF_MEMALLOC); } asmlinkage __visible void do_softirq(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c8714fcb53c..21343d110296 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1176,6 +1176,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_BPF_SYSCALL diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ce3a31e8eb36..5cb5b0008d97 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, * * Returns the granularity of underlying alarm base clock */ -static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { if (!alarmtimer_get_rtcdev()) return -EINVAL; @@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) * * Provides the underlying alarm base time. */ -static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) +static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec(base->gettime()); + *tp = ktime_to_timespec64(base->gettime()); return 0; } @@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer) * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, - struct itimerspec *cur_setting) + struct itimerspec64 *cur_setting) { ktime_t relative_expiry_time = alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); if (ktime_to_ns(relative_expiry_time) > 0) { - cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); } else { cur_setting->it_value.tv_sec = 0; cur_setting->it_value.tv_nsec = 0; } - cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); + cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); } /** @@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr) * Sets the timer to new_setting, and starts the timer. */ static int alarm_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, - struct itimerspec *old_setting) + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { ktime_t exp; @@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, return TIMER_RETRY; /* start the timer */ - timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); - exp = timespec_to_ktime(new_setting->it_value); + timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now; @@ -790,13 +790,14 @@ out: * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsreq, struct timespec __user *rmtp) + struct timespec64 *tsreq, + struct timespec __user *rmtp) { enum alarmtimer_type type = clock2alarm(which_clock); + struct restart_block *restart; struct alarm alarm; ktime_t exp; int ret = 0; - struct restart_block *restart; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - exp = timespec_to_ktime(*tsreq); + exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now = alarm_bases[type].gettime(); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 97ac0951f164..4237e0744e26 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -void clockevents_config(struct clock_event_device *dev, u32 freq) +static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ec08f527d7ee..a7560123617c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1368,10 +1368,7 @@ retry: ktime_to_ns(delta)); } -/* - * local version of hrtimer_peek_ahead_timers() called with interrupts - * disabled. - */ +/* called with interrupts disabled */ static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1506,7 +1503,7 @@ out: return ret; } -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, +long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; @@ -1519,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); if (do_nanosleep(&t, mode)) goto out; @@ -1550,15 +1547,17 @@ out: SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct timespec64 tu64; struct timespec tu; if (copy_from_user(&tu, rqtp, sizeof(tu))) return -EFAULT; - if (!timespec_valid(&tu)) + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) return -EINVAL; - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 9cff0ab82b63..31d588d37a17 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -297,7 +297,7 @@ out: return err; } -static int pc_clock_gettime(clockid_t id, struct timespec *ts) +static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts) return err; } -static int pc_clock_getres(clockid_t id, struct timespec *ts) +static int pc_clock_getres(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts) return err; } -static int pc_clock_settime(clockid_t id, const struct timespec *ts) +static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit) return err; } -static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) { clockid_t id = kit->it_clock; struct posix_clock_desc cd; @@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) } static int pc_timer_settime(struct k_itimer *kit, int flags, - struct itimerspec *ts, struct itimerspec *old) + struct itimerspec64 *ts, struct itimerspec64 *old) { clockid_t id = kit->it_clock; struct posix_clock_desc cd; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4513ad16a253..949e434d3536 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p) } static int -posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { int error = check_clock(which_clock); if (!error) { @@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) } static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, static int posix_cpu_clock_get_task(struct task_struct *tsk, const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { int err = -EINVAL; u64 rtn; @@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, } if (!err) - *tp = ns_to_timespec(rtn); + *tp = ns_to_timespec64(rtn); return err; } -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int err = -EINVAL; @@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * and try again. (This happens when the timer is in the middle of firing.) */ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, - struct itimerspec *new, struct itimerspec *old) + struct itimerspec64 *new, struct itimerspec64 *old) { unsigned long flags; struct sighand_struct *sighand; @@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, WARN_ON_ONCE(p == NULL); - new_expires = timespec_to_ns(&new->it_value); + new_expires = timespec64_to_ns(&new->it_value); /* * Protect against sighand release/switch in exit/exec and p->cpu_timers @@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, bump_cpu_timer(timer, val); if (val < timer->it.cpu.expires) { old_expires = timer->it.cpu.expires - val; - old->it_value = ns_to_timespec(old_expires); + old->it_value = ns_to_timespec64(old_expires); } else { old->it_value.tv_nsec = 1; old->it_value.tv_sec = 0; @@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * Install the new reload setting, and * set up the signal and overrun bookkeeping. */ - timer->it.cpu.incr = timespec_to_ns(&new->it_interval); + timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); /* * This acts as a modification timestamp for the timer, @@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = 0; out: if (old) - old->it_interval = ns_to_timespec(old_incr); + old->it_interval = ns_to_timespec64(old_incr); return ret; } -static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { u64 now; struct task_struct *p = timer->it.cpu.task; @@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) /* * Easy part: convert the reload time. */ - itp->it_interval = ns_to_timespec(timer->it.cpu.incr); + itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; @@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) * Call the timer disarmed, nothing else to do. */ timer->it.cpu.expires = 0; - itp->it_value = ns_to_timespec(timer->it.cpu.expires); + itp->it_value = ns_to_timespec64(timer->it.cpu.expires); return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); @@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } if (now < timer->it.cpu.expires) { - itp->it_value = ns_to_timespec(timer->it.cpu.expires - now); + itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); } else { /* * The timer should have expired already, but the firing @@ -825,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ + pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -836,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk, soft += USEC_PER_SEC; sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } - printk(KERN_INFO - "RT Watchdog Timeout: %s[%d]\n", + pr_info("RT Watchdog Timeout (soft): %s[%d]\n", tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } @@ -935,6 +936,8 @@ static void check_process_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ + pr_info("RT Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -942,6 +945,8 @@ static void check_process_timers(struct task_struct *tsk, /* * At the soft limit, send a SIGXCPU every second. */ + pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { soft++; @@ -1214,7 +1219,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct itimerspec *it) + struct timespec64 *rqtp, struct itimerspec64 *it) { struct k_itimer timer; int error; @@ -1229,7 +1234,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, error = posix_cpu_timer_create(&timer); timer.it_process = current; if (!error) { - static struct itimerspec zero_it; + static struct itimerspec64 zero_it; memset(it, 0, sizeof *it); it->it_value = *rqtp; @@ -1264,7 +1269,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. */ - *rqtp = ns_to_timespec(timer.it.cpu.expires); + *rqtp = ns_to_timespec64(timer.it.cpu.expires); error = posix_cpu_timer_set(&timer, 0, &zero_it, it); if (!error) { /* @@ -1301,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) + struct timespec64 *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = ¤t->restart_block; - struct itimerspec it; + struct itimerspec64 it; + struct timespec ts; int error; /* @@ -1324,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, /* * Report back to the user the time still remaining. */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + ts = timespec64_to_timespec(it.it_value); + if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec_to_ns(rqtp); + restart_block->nanosleep.expires = timespec64_to_ns(rqtp); } return error; } @@ -1338,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct timespec t; - struct itimerspec it; + struct itimerspec64 it; + struct timespec64 t; + struct timespec tmp; int error; - t = ns_to_timespec(restart_block->nanosleep.expires); + t = ns_to_timespec64(restart_block->nanosleep.expires); error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); @@ -1351,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) /* * Report back to the user the time still remaining. */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + tmp = timespec64_to_timespec(it.it_value); + if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) return -EFAULT; - restart_block->nanosleep.expires = timespec_to_ns(&t); + restart_block->nanosleep.expires = timespec64_to_ns(&t); } return error; @@ -1364,12 +1373,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) static int process_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_getres(PROCESS_CLOCK, tp); } static int process_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_get(PROCESS_CLOCK, tp); } @@ -1379,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, + struct timespec64 *rqtp, struct timespec __user *rmtp) { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); @@ -1389,12 +1398,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block) return -EINVAL; } static int thread_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_getres(THREAD_CLOCK, tp); } static int thread_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_get(THREAD_CLOCK, tp); } diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index cd6716e115e8..c0cd53eb018a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -49,26 +49,32 @@ SYS_NI(alarm); SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct timespec64 new_tp64; struct timespec new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return do_sys_settimeofday(&new_tp, NULL); + + new_tp64 = timespec_to_timespec64(new_tp); + return do_sys_settimeofday64(&new_tp64, NULL); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct timespec64 kernel_tp64; struct timespec kernel_tp; switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; - case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; - case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; + case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; + case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; + case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; default: return -EINVAL; } + + kernel_tp = timespec64_to_timespec(kernel_tp64); if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) return -EFAULT; return 0; @@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct timespec64 t64; struct timespec t; switch (which_clock) { @@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, case CLOCK_BOOTTIME: if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if (!timespec_valid(&t)) + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) return -EINVAL; - return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); default: diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 50a6a47020de..4d7b2ce09c27 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -130,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; /* * These ones are defined below. */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t, +static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, struct timespec __user *rmtp); static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec *); +static void common_timer_get(struct k_itimer *, struct itimerspec64 *); static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); + struct itimerspec64 *, struct itimerspec64 *); static int common_timer_del(struct k_itimer *timer); static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); @@ -204,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_real_ts(tp); + ktime_get_real_ts64(tp); return 0; } /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, - const struct timespec *tp) + const struct timespec64 *tp) { - return do_sys_settimeofday(tp, NULL); + return do_sys_settimeofday64(tp, NULL); } static int posix_clock_realtime_adj(const clockid_t which_clock, @@ -226,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_ts(tp); + ktime_get_ts64(tp); return 0; } /* * Get monotonic-raw time for posix timers */ -static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { - getrawmonotonic(tp); + getrawmonotonic64(tp); return 0; } -static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { - *tp = current_kernel_time(); + *tp = current_kernel_time64(); return 0; } static int posix_get_monotonic_coarse(clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { - *tp = get_monotonic_coarse(); + *tp = get_monotonic_coarse64(); return 0; } -static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) { - *tp = ktime_to_timespec(KTIME_LOW_RES); + *tp = ktime_to_timespec64(KTIME_LOW_RES); return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) { - get_monotonic_boottime(tp); + get_monotonic_boottime64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - timekeeping_clocktai(tp); + timekeeping_clocktai64(tp); return 0; } -static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = hrtimer_resolution; @@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * report. */ static void -common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { ktime_t now, remaining, iv; struct hrtimer *timer = &timr->it.real.timer; - memset(cur_setting, 0, sizeof(struct itimerspec)); + memset(cur_setting, 0, sizeof(*cur_setting)); iv = timr->it.real.interval; /* interval timer ? */ if (iv) - cur_setting->it_interval = ktime_to_timespec(iv); + cur_setting->it_interval = ktime_to_timespec64(iv); else if (!hrtimer_active(timer) && (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) return; @@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) cur_setting->it_value.tv_nsec = 1; } else - cur_setting->it_value = ktime_to_timespec(remaining); + cur_setting->it_value = ktime_to_timespec64(remaining); } /* Get the time remaining on a POSIX.1b interval timer. */ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { + struct itimerspec64 cur_setting64; struct itimerspec cur_setting; struct k_itimer *timr; struct k_clock *kc; @@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else - kc->timer_get(timr, &cur_setting); + kc->timer_get(timr, &cur_setting64); unlock_timer(timr, flags); + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; @@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) /* timr->it_lock is taken. */ static int common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, struct itimerspec *old_setting) + struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) { struct hrtimer *timer = &timr->it.real.timer; enum hrtimer_mode mode; @@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags, hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; - hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); + hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); /* Convert interval */ - timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); + timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { @@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct itimerspec __user *, new_setting, struct itimerspec __user *, old_setting) { - struct k_itimer *timr; + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; struct itimerspec new_spec, old_spec; - int error = 0; + struct k_itimer *timr; unsigned long flag; - struct itimerspec *rtn = old_setting ? &old_spec : NULL; struct k_clock *kc; + int error = 0; if (!new_setting) return -EINVAL; if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) return -EFAULT; + new_spec64 = itimerspec_to_itimerspec64(&new_spec); - if (!timespec_valid(&new_spec.it_interval) || - !timespec_valid(&new_spec.it_value)) + if (!timespec64_valid(&new_spec64.it_interval) || + !timespec64_valid(&new_spec64.it_value)) return -EINVAL; retry: timr = lock_timer(timer_id, &flag); @@ -908,7 +912,7 @@ retry: if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, &new_spec, rtn); + error = kc->timer_set(timr, flags, &new_spec64, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { @@ -916,6 +920,7 @@ retry: goto retry; } + old_spec = itimerspec64_to_itimerspec(&old_spec64); if (old_setting && !error && copy_to_user(old_setting, &old_spec, sizeof (old_spec))) error = -EFAULT; @@ -1014,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp64; struct timespec new_tp; if (!kc || !kc->clock_set) @@ -1021,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; + new_tp64 = timespec_to_timespec64(new_tp); - return kc->clock_set(which_clock, &new_tp); + return kc->clock_set(which_clock, &new_tp64); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp64; struct timespec kernel_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp); + error = kc->clock_get(which_clock, &kernel_tp64); + kernel_tp = timespec64_to_timespec(kernel_tp64); if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; @@ -1070,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp64; struct timespec rtn_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp); + error = kc->clock_getres(which_clock, &rtn_tp64); + rtn_tp = timespec64_to_timespec(rtn_tp64); if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; @@ -1088,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) + struct timespec64 *tsave, struct timespec __user *rmtp) { return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, @@ -1100,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct timespec __user *, rmtp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t64; struct timespec t; if (!kc) @@ -1110,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if (!timespec_valid(&t)) + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) return -EINVAL; - return kc->nsleep(which_clock, flags, &t, rmtp); + return kc->nsleep(which_clock, flags, &t64, rmtp); } /* diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index ea6b610c4c57..2d8f05aad442 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) update_clock_read_data(&rd); + if (sched_clock_timer.function != NULL) { + /* update timeout for clock wrap */ + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + } + r = rate; if (r >= 4000000) { r /= 1000000; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7fe53be86077..64c97fc130c4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/time/time.c b/kernel/time/time.c index 25bdd2504571..6574bba44b55 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { + struct timespec64 new_ts; struct timeval user_tv; - struct timespec new_ts; struct timezone new_tz; if (tv) { @@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, return -EFAULT; } - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b63a2102c29..9652bc57fd09 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, return 0; /* Interpolate shortest distance from beginning or end of history */ - interp_forward = partial_history_cycles > total_history_cycles/2 ? - true : false; + interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? total_history_cycles - partial_history_cycles : partial_history_cycles; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1dc0256bfb6e..cc6b6bdd1329 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -241,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write, int ret; mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(false); mutex_unlock(&mutex); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ff8d5c13d04b..0e7f5428a148 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -16,6 +16,7 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> +#include <linux/nmi.h> #include <linux/uaccess.h> @@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, next_one: i = 0; + + touch_nmi_watchdog(); + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = timerqueue_getnext(&base->active); @@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; + touch_nmi_watchdog(); + SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b2058a7f94bd..bd8ae8d5ae9c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q) /** * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for * @rq: the source request + * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q) * Records an action against a request. Will log the bio offset + size. * **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, +static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt = rq->q->blk_trace; if (likely(!bt)) return; @@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); -} - -static void blk_add_trace_rq_abort(void *ignore, - struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); + rq->cmd_flags, what, error, 0, NULL); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(void *ignore, - struct request_queue *q, - struct request *rq, - unsigned int nr_bytes) +static void blk_add_trace_rq_complete(void *ignore, struct request *rq, + int error, unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); } /** @@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, + rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r); } @@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + BLK_TA_DRV_DATA, 0, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -974,8 +966,6 @@ static void blk_register_tracepoints(void) { int ret; - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); - WARN_ON(ret); ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); @@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); tracepoint_synchronize_unregister(); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cee9802cf3e0..f806dbd66de9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + if (unlikely(uaccess_kernel())) return -EPERM; if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) return -EPERM; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b9691ee8f6c1..dd3e91d68dc7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3755,23 +3755,24 @@ static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) ftrace_probe_registered = 1; } -static void __disable_ftrace_function_probe(void) +static bool __disable_ftrace_function_probe(void) { int i; if (!ftrace_probe_registered) - return; + return false; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; if (hhd->first) - return; + return false; } /* no more funcs left */ ftrace_shutdown(&trace_probe_ops, 0); ftrace_probe_registered = 0; + return true; } @@ -3901,6 +3902,7 @@ static void __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { + struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; @@ -3912,6 +3914,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int i, ret; + bool disabled; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) func_g.search = NULL; @@ -3930,6 +3933,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&trace_probe_ops.func_hash->regex_lock); + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) /* Hmm, should report this somehow */ @@ -3967,12 +3974,17 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } } mutex_lock(&ftrace_lock); - __disable_ftrace_function_probe(); + disabled = __disable_ftrace_function_probe(); /* * Remove after the disable is called. Otherwise, if the last * probe is removed, a null hash means *all enabled*. */ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + /* still need to update the function call sites */ + if (ftrace_enabled && !disabled) + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + &old_hash_ops); synchronize_sched(); if (!ret) free_ftrace_hash_rcu(old_hash); @@ -5554,6 +5566,15 @@ static void clear_ftrace_pids(struct trace_array *tr) trace_free_pid_list(pid_list); } +void ftrace_clear_pids(struct trace_array *tr) +{ + mutex_lock(&ftrace_lock); + + clear_ftrace_pids(tr); + + mutex_unlock(&ftrace_lock); +} + static void ftrace_pid_reset(struct trace_array *tr) { mutex_lock(&ftrace_lock); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 54e7a90db848..ca47a4fa2986 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3405,11 +3405,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); int ring_buffer_iter_empty(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + struct buffer_page *head_page; + struct buffer_page *commit_page; + unsigned commit; cpu_buffer = iter->cpu_buffer; - return iter->head_page == cpu_buffer->commit_page && - iter->head == rb_commit_index(cpu_buffer); + /* Remember, trace recording is off when iterator is in use */ + reader = cpu_buffer->reader_page; + head_page = cpu_buffer->head_page; + commit_page = cpu_buffer->commit_page; + commit = rb_page_commit(commit_page); + + return ((iter->head_page == commit_page && iter->head == commit) || + (iter->head_page == reader && commit_page == head_page && + head_page->read == commit && + iter->head == rb_page_commit(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0ed834d6beb0..b253d59b9c51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6734,11 +6734,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = alloc_snapshot(&global_trace); + if (ret < 0) + goto out; - if (ret >= 0) - alloc_snapshot(&global_trace); + ret = register_ftrace_function_probe(glob, ops, count); + out: return ret < 0 ? ret : 0; } @@ -7403,6 +7405,7 @@ static int instance_rmdir(const char *name) tracing_set_nop(tr); event_trace_del_tracer(tr); + ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove_recursive(tr->dir); free_trace_buffers(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ae1cce91fead..d19d52d600d6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -896,6 +896,7 @@ int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_clear_pids(struct trace_array *tr); #else static inline int ftrace_trace_task(struct trace_array *tr) { @@ -914,6 +915,7 @@ ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_clear_pids(struct trace_array *tr) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0168b7da1ea..c74bf39ef764 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool) INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; + setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout, + (unsigned long)pool); setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); @@ -4735,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); + +/** + * work_on_cpu_safe - run a function in thread context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function argument + * + * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold + * any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. + */ +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + long ret = -ENODEV; + + get_online_cpus(); + if (cpu_online(cpu)) + ret = work_on_cpu(cpu, fn, arg); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu_safe); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER |