Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c | 7
-rw-r--r--  kernel/bpf/arraymap.c | 18
-rw-r--r--  kernel/bpf/cgroup.c | 59
-rw-r--r--  kernel/bpf/core.c | 14
-rw-r--r--  kernel/bpf/hashtab.c | 22
-rw-r--r--  kernel/bpf/stackmap.c | 20
-rw-r--r--  kernel/bpf/syscall.c | 54
-rw-r--r--  kernel/bpf/verifier.c | 2
-rw-r--r--  kernel/capability.c | 1
-rw-r--r--  kernel/cgroup.c | 22
-rw-r--r--  kernel/cpu.c | 24
-rw-r--r--  kernel/delayacct.c | 6
-rw-r--r--  kernel/events/core.c | 535
-rw-r--r--  kernel/exit.c | 4
-rw-r--r--  kernel/extable.c | 9
-rw-r--r--  kernel/fork.c | 12
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/irq/devres.c | 65
-rw-r--r--  kernel/irq/irqdomain.c | 44
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq/spurious.c | 4
-rw-r--r--  kernel/jump_label.c | 7
-rw-r--r--  kernel/kprobes.c | 73
-rw-r--r--  kernel/kthread.c | 1
-rw-r--r--  kernel/locking/lockdep.c | 12
-rw-r--r--  kernel/locking/locktorture.c | 6
-rw-r--r--  kernel/locking/mutex.c | 24
-rw-r--r--  kernel/membarrier.c | 4
-rw-r--r--  kernel/module.c | 55
-rw-r--r--  kernel/panic.c | 4
-rw-r--r--  kernel/pid_namespace.c | 10
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/rcu/rcu.h | 1
-rw-r--r--  kernel/rcu/rcutorture.c | 19
-rw-r--r--  kernel/rcu/srcu.c | 143
-rw-r--r--  kernel/rcu/tiny.c | 6
-rw-r--r--  kernel/rcu/tiny_plugin.h | 9
-rw-r--r--  kernel/rcu/tree.c | 295
-rw-r--r--  kernel/rcu/tree.h | 15
-rw-r--r--  kernel/rcu/tree_exp.h | 90
-rw-r--r--  kernel/rcu/tree_plugin.h | 9
-rw-r--r--  kernel/rcu/tree_trace.c | 5
-rw-r--r--  kernel/rcu/update.c | 44
-rw-r--r--  kernel/sched/Makefile | 4
-rw-r--r--  kernel/sched/autogroup.c (renamed from kernel/sched/auto_group.c) | 0
-rw-r--r--  kernel/sched/autogroup.h (renamed from kernel/sched/auto_group.h) | 0
-rw-r--r--  kernel/sched/clock.c | 158
-rw-r--r--  kernel/sched/completion.c | 10
-rw-r--r--  kernel/sched/core.c | 2333
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/cputime.c | 178
-rw-r--r--  kernel/sched/deadline.c | 13
-rw-r--r--  kernel/sched/debug.c | 4
-rw-r--r--  kernel/sched/fair.c | 94
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 14
-rw-r--r--  kernel/sched/sched.h | 137
-rw-r--r--  kernel/sched/stats.h | 36
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/topology.c | 1658
-rw-r--r--  kernel/signal.c | 12
-rw-r--r--  kernel/stacktrace.c | 12
-rw-r--r--  kernel/sys.c | 16
-rw-r--r--  kernel/sysctl.c | 3
-rw-r--r--  kernel/time/Makefile | 1
-rw-r--r--  kernel/time/clocksource.c | 4
-rw-r--r--  kernel/time/hrtimer.c | 58
-rw-r--r--  kernel/time/itimer.c | 60
-rw-r--r--  kernel/time/jiffies.c | 32
-rw-r--r--  kernel/time/posix-cpu-timers.c | 170
-rw-r--r--  kernel/time/tick-broadcast.c | 30
-rw-r--r--  kernel/time/time.c | 10
-rw-r--r--  kernel/time/timeconst.bc | 6
-rw-r--r--  kernel/time/timekeeping.c | 39
-rw-r--r--  kernel/time/timekeeping.h | 2
-rw-r--r--  kernel/time/timekeeping_debug.c | 4
-rw-r--r--  kernel/time/timer.c | 48
-rw-r--r--  kernel/time/timer_list.c | 12
-rw-r--r--  kernel/time/timer_stats.c | 425
-rw-r--r--  kernel/trace/trace_hwlat.c | 8
-rw-r--r--  kernel/trace/trace_kprobe.c | 2
-rw-r--r--  kernel/tsacct.c | 21
-rw-r--r--  kernel/ucount.c | 17
-rw-r--r--  kernel/watchdog.c | 9
-rw-r--r--  kernel/watchdog_hld.c | 3
-rw-r--r--  kernel/workqueue.c | 2
87 files changed, 3802 insertions(+), 3613 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index 74963d192c5d..ca9cb55b5855 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -453,8 +453,8 @@ static void fill_ac(acct_t *ac)
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
- ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
- ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime));
+ ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime));
ac->ac_flag = pacct->ac_flag;
ac->ac_mem = encode_comp_t(pacct->ac_mem);
ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
@@ -530,7 +530,7 @@ out:
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
- cputime_t utime, stime;
+ u64 utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -559,6 +559,7 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
+
task_cputime(current, &utime, &stime);
pacct->ac_utime += utime;
pacct->ac_stime += stime;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 229a5d5df977..3d55d95dcf49 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,7 +11,6 @@
*/
#include <linux/bpf.h>
#include <linux/err.h>
-#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
@@ -74,14 +73,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (array_size >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
-
/* allocate all map elements and zero-initialize them */
- array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
- if (!array) {
- array = vzalloc(array_size);
- if (!array)
- return ERR_PTR(-ENOMEM);
- }
+ array = bpf_map_area_alloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -97,7 +92,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (array_size >= U32_MAX - PAGE_SIZE ||
elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
- kvfree(array);
+ bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
out:
@@ -262,7 +257,7 @@ static void array_map_free(struct bpf_map *map)
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
- kvfree(array);
+ bpf_map_area_free(array);
}
static const struct bpf_map_ops array_ops = {
@@ -319,7 +314,8 @@ static void fd_array_map_free(struct bpf_map *map)
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
- kvfree(array);
+
+ bpf_map_area_free(array);
}
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index a515f7b007c6..da0f53690295 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -52,6 +52,7 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
e = rcu_dereference_protected(parent->bpf.effective[type],
lockdep_is_held(&cgroup_mutex));
rcu_assign_pointer(cgrp->bpf.effective[type], e);
+ cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
}
}
@@ -82,30 +83,63 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
*
* Must be called with cgroup_mutex held.
*/
-void __cgroup_bpf_update(struct cgroup *cgrp,
- struct cgroup *parent,
- struct bpf_prog *prog,
- enum bpf_attach_type type)
+int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
+ struct bpf_prog *prog, enum bpf_attach_type type,
+ bool new_overridable)
{
- struct bpf_prog *old_prog, *effective;
+ struct bpf_prog *old_prog, *effective = NULL;
struct cgroup_subsys_state *pos;
+ bool overridable = true;
- old_prog = xchg(cgrp->bpf.prog + type, prog);
+ if (parent) {
+ overridable = !parent->bpf.disallow_override[type];
+ effective = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ }
+
+ if (prog && effective && !overridable)
+ /* if parent has non-overridable prog attached, disallow
+ * attaching new programs to descendent cgroup
+ */
+ return -EPERM;
+
+ if (prog && effective && overridable != new_overridable)
+ /* if parent has overridable prog attached, only
+ * allow overridable programs in descendent cgroup
+ */
+ return -EPERM;
- effective = (!prog && parent) ?
- rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex)) :
- prog;
+ old_prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ overridable = new_overridable;
+ effective = prog;
+ if (old_prog &&
+ cgrp->bpf.disallow_override[type] == new_overridable)
+ /* disallow attaching non-overridable on top
+ * of existing overridable in this cgroup
+ * and vice versa
+ */
+ return -EPERM;
+ }
+
+ if (!prog && !old_prog)
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+
+ cgrp->bpf.prog[type] = prog;
css_for_each_descendant_pre(pos, &cgrp->self) {
struct cgroup *desc = container_of(pos, struct cgroup, self);
/* skip the subtree if the descendant has its own program */
- if (desc->bpf.prog[type] && desc != cgrp)
+ if (desc->bpf.prog[type] && desc != cgrp) {
pos = css_rightmost_descendant(pos);
- else
+ } else {
rcu_assign_pointer(desc->bpf.effective[type],
effective);
+ desc->bpf.disallow_override[type] = !overridable;
+ }
}
if (prog)
@@ -115,6 +149,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
}
+ return 0;
}
/**
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1eb4f1303756..503d4211988a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -146,10 +146,11 @@ void __bpf_prog_free(struct bpf_prog *fp)
vfree(fp);
}
-int bpf_prog_calc_digest(struct bpf_prog *fp)
+int bpf_prog_calc_tag(struct bpf_prog *fp)
{
const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64);
- u32 raw_size = bpf_prog_digest_scratch_size(fp);
+ u32 raw_size = bpf_prog_tag_scratch_size(fp);
+ u32 digest[SHA_DIGEST_WORDS];
u32 ws[SHA_WORKSPACE_WORDS];
u32 i, bsize, psize, blocks;
struct bpf_insn *dst;
@@ -162,7 +163,7 @@ int bpf_prog_calc_digest(struct bpf_prog *fp)
if (!raw)
return -ENOMEM;
- sha_init(fp->digest);
+ sha_init(digest);
memset(ws, 0, sizeof(ws));
/* We need to take out the map fd for the digest calculation
@@ -204,13 +205,14 @@ int bpf_prog_calc_digest(struct bpf_prog *fp)
*bits = cpu_to_be64((psize - 1) << 3);
while (blocks--) {
- sha_transform(fp->digest, todo, ws);
+ sha_transform(digest, todo, ws);
todo += SHA_MESSAGE_BYTES;
}
- result = (__force __be32 *)fp->digest;
+ result = (__force __be32 *)digest;
for (i = 0; i < SHA_DIGEST_WORDS; i++)
- result[i] = cpu_to_be32(fp->digest[i]);
+ result[i] = cpu_to_be32(digest[i]);
+ memcpy(fp->tag, result, sizeof(fp->tag));
vfree(raw);
return 0;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3f2bb58952d8..a753bbe7df0a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,7 +13,6 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
@@ -103,7 +102,7 @@ static void htab_free_elems(struct bpf_htab *htab)
free_percpu(pptr);
}
free_elems:
- vfree(htab->elems);
+ bpf_map_area_free(htab->elems);
}
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
@@ -125,7 +124,8 @@ static int prealloc_init(struct bpf_htab *htab)
{
int err = -ENOMEM, i;
- htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+ htab->elems = bpf_map_area_alloc(htab->elem_size *
+ htab->map.max_entries);
if (!htab->elems)
return -ENOMEM;
@@ -320,14 +320,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_htab;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
- GFP_USER | __GFP_NOWARN);
-
- if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
- if (!htab->buckets)
- goto free_htab;
- }
+ htab->buckets = bpf_map_area_alloc(htab->n_buckets *
+ sizeof(struct bucket));
+ if (!htab->buckets)
+ goto free_htab;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_HEAD(&htab->buckets[i].head);
@@ -354,7 +350,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_extra_elems:
free_percpu(htab->extra_elems);
free_buckets:
- kvfree(htab->buckets);
+ bpf_map_area_free(htab->buckets);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -1014,7 +1010,7 @@ static void htab_map_free(struct bpf_map *map)
prealloc_destroy(htab);
free_percpu(htab->extra_elems);
- kvfree(htab->buckets);
+ bpf_map_area_free(htab->buckets);
kfree(htab);
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 732ae16d12b7..be8519148c25 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,7 +7,6 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"
@@ -32,7 +31,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
- smap->elems = vzalloc(elem_size * smap->map.max_entries);
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
if (!smap->elems)
return -ENOMEM;
@@ -45,7 +44,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
return 0;
free_elems:
- vfree(smap->elems);
+ bpf_map_area_free(smap->elems);
return err;
}
@@ -76,12 +75,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
- smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
- if (!smap) {
- smap = vzalloc(cost);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- }
+ smap = bpf_map_area_alloc(cost);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
err = -E2BIG;
cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
@@ -112,7 +108,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
put_buffers:
put_callchain_buffers();
free_smap:
- kvfree(smap);
+ bpf_map_area_free(smap);
return ERR_PTR(err);
}
@@ -262,9 +258,9 @@ static void stack_map_free(struct bpf_map *map)
/* wait for bpf programs to complete before freeing stack map */
synchronize_rcu();
- vfree(smap->elems);
+ bpf_map_area_free(smap->elems);
pcpu_freelist_destroy(&smap->freelist);
- kvfree(smap);
+ bpf_map_area_free(smap);
put_callchain_buffers();
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e89acea22ecf..bbb016adbaeb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -12,6 +12,8 @@
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
@@ -49,6 +51,30 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+void *bpf_map_area_alloc(size_t size)
+{
+ /* We definitely need __GFP_NORETRY, so OOM killer doesn't
+ * trigger under memory pressure as we really just want to
+ * fail instead.
+ */
+ const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+ void *area;
+
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ area = kmalloc(size, GFP_USER | flags);
+ if (area != NULL)
+ return area;
+ }
+
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags,
+ PAGE_KERNEL);
+}
+
+void bpf_map_area_free(void *area)
+{
+ kvfree(area);
+}
+
int bpf_map_precharge_memlock(u32 pages)
{
struct user_struct *user = get_current_user();
@@ -688,17 +714,17 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
- char prog_digest[sizeof(prog->digest) * 2 + 1] = { };
+ char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_digest, prog->digest, sizeof(prog->digest));
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
- "prog_digest:\t%s\n"
+ "prog_tag:\t%s\n"
"memlock:\t%llu\n",
prog->type,
prog->jited,
- prog_digest,
+ prog_tag,
prog->pages * 1ULL << PAGE_SHIFT);
}
#endif
@@ -894,13 +920,14 @@ static int bpf_obj_get(const union bpf_attr *attr)
#ifdef CONFIG_CGROUP_BPF
-#define BPF_PROG_ATTACH_LAST_FIELD attach_type
+#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
static int bpf_prog_attach(const union bpf_attr *attr)
{
+ enum bpf_prog_type ptype;
struct bpf_prog *prog;
struct cgroup *cgrp;
- enum bpf_prog_type ptype;
+ int ret;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -908,6 +935,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
+ if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ return -EINVAL;
+
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
@@ -930,10 +960,13 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return PTR_ERR(cgrp);
}
- cgroup_bpf_update(cgrp, prog, attr->attach_type);
+ ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
+ attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ if (ret)
+ bpf_prog_put(prog);
cgroup_put(cgrp);
- return 0;
+ return ret;
}
#define BPF_PROG_DETACH_LAST_FIELD attach_type
@@ -941,6 +974,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr)
{
struct cgroup *cgrp;
+ int ret;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -956,7 +990,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
- cgroup_bpf_update(cgrp, NULL, attr->attach_type);
+ ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
cgroup_put(cgrp);
break;
@@ -964,7 +998,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return -EINVAL;
}
- return 0;
+ return ret;
}
#endif /* CONFIG_CGROUP_BPF */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 83ed2f8f6f22..cdc43b899f28 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2936,7 +2936,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
int insn_cnt = env->prog->len;
int i, j, err;
- err = bpf_prog_calc_digest(env->prog);
+ err = bpf_prog_calc_tag(env->prog);
if (err)
return err;
diff --git a/kernel/capability.c b/kernel/capability.c
index a98e814f216f..f97fe77ceb88 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -318,6 +318,7 @@ bool has_capability(struct task_struct *t, int cap)
{
return has_ns_capability(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability);
/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2ee9ec3051b2..53bbca7c4859 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5221,6 +5221,11 @@ err_free_css:
return ERR_PTR(err);
}
+/*
+ * The returned cgroup is fully initialized including its control mask, but
+ * it isn't associated with its kernfs_node and doesn't have the control
+ * mask applied.
+ */
static struct cgroup *cgroup_create(struct cgroup *parent)
{
struct cgroup_root *root = parent->root;
@@ -5288,11 +5293,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgroup_propagate_control(cgrp);
- /* @cgrp doesn't have dir yet so the following will only create csses */
- ret = cgroup_apply_control_enable(cgrp);
- if (ret)
- goto out_destroy;
-
return cgrp;
out_cancel_ref:
@@ -5300,9 +5300,6 @@ out_cancel_ref:
out_free_cgrp:
kfree(cgrp);
return ERR_PTR(ret);
-out_destroy:
- cgroup_destroy_locked(cgrp);
- return ERR_PTR(ret);
}
static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
@@ -6501,15 +6498,16 @@ static __init int cgroup_namespaces_init(void)
subsys_initcall(cgroup_namespaces_init);
#ifdef CONFIG_CGROUP_BPF
-void cgroup_bpf_update(struct cgroup *cgrp,
- struct bpf_prog *prog,
- enum bpf_attach_type type)
+int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, bool overridable)
{
struct cgroup *parent = cgroup_parent(cgrp);
+ int ret;
mutex_lock(&cgroup_mutex);
- __cgroup_bpf_update(cgrp, parent, prog, type);
+ ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
mutex_unlock(&cgroup_mutex);
+ return ret;
}
#endif /* CONFIG_CGROUP_BPF */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f75c4d031eeb..0a5f630f5c54 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -764,7 +764,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int prev_state, ret = 0;
- bool hasdied = false;
if (num_online_cpus() == 1)
return -EBUSY;
@@ -809,7 +808,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_kick_ap_work(cpu);
}
- hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
out:
cpu_hotplug_done();
return ret;
@@ -1302,10 +1300,24 @@ static int cpuhp_cb_check(enum cpuhp_state state)
*/
static int cpuhp_reserve_state(enum cpuhp_state state)
{
- enum cpuhp_state i;
+ enum cpuhp_state i, end;
+ struct cpuhp_step *step;
- for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
- if (!cpuhp_ap_states[i].name)
+ switch (state) {
+ case CPUHP_AP_ONLINE_DYN:
+ step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN;
+ end = CPUHP_AP_ONLINE_DYN_END;
+ break;
+ case CPUHP_BP_PREPARE_DYN:
+ step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN;
+ end = CPUHP_BP_PREPARE_DYN_END;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ for (i = state; i <= end; i++, step++) {
+ if (!step->name)
return i;
}
WARN(1, "No more dynamic states available for CPU hotplug\n");
@@ -1323,7 +1335,7 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
mutex_lock(&cpuhp_state_mutex);
- if (state == CPUHP_AP_ONLINE_DYN) {
+ if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
ret = cpuhp_reserve_state(state);
if (ret < 0)
goto out;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 435c14a45118..660549656991 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -82,19 +82,19 @@ void __delayacct_blkio_end(void)
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
- cputime_t utime, stime, stimescaled, utimescaled;
+ u64 utime, stime, stimescaled, utimescaled;
unsigned long long t2, t3;
unsigned long flags, t1;
s64 tmp;
task_cputime(tsk, &utime, &stime);
tmp = (s64)d->cpu_run_real_total;
- tmp += cputime_to_nsecs(utime + stime);
+ tmp += utime + stime;
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
tmp = (s64)d->cpu_scaled_run_real_total;
- tmp += cputime_to_nsecs(utimescaled + stimescaled);
+ tmp += utimescaled + stimescaled;
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ab15509fab8c..77a932b54a64 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -355,6 +355,8 @@ enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_TIME = 0x4,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x8,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
@@ -678,6 +680,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
info->timestamp = ctx->timestamp;
}
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
@@ -690,61 +694,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ struct list_head *list;
unsigned long flags;
/*
- * disable interrupts to avoid geting nr_cgroup
- * changes via __perf_event_disable(). Also
- * avoids preemption.
+ * Disable interrupts and preemption to avoid this CPU's
+ * cgrp_cpuctx_entry to change under us.
*/
local_irq_save(flags);
- /*
- * we reschedule only in the presence of cgroup
- * constrained events.
- */
+ list = this_cpu_ptr(&cgrp_cpuctx_list);
+ list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- continue; /* ensure we process each cpuctx once */
-
- /*
- * perf_cgroup_events says at least one
- * context on this CPU has cgroup events.
- *
- * ctx->nr_cgroups reports the number of cgroup
- * events for a context.
- */
- if (cpuctx->ctx.nr_cgroups > 0) {
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
+ if (mode & PERF_CGROUP_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->cgrp = NULL;
+ }
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgorup events are only per-cpu
- */
- cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ if (mode & PERF_CGROUP_SWIN) {
+ WARN_ON_ONCE(cpuctx->cgrp);
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
+ * we pass the cpuctx->ctx to perf_cgroup_from_task()
+ * because cgorup events are only per-cpu
+ */
+ cpuctx->cgrp = perf_cgroup_from_task(task,
+ &cpuctx->ctx);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
local_irq_restore(flags);
@@ -889,6 +878,7 @@ list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
struct perf_cpu_context *cpuctx;
+ struct list_head *cpuctx_entry;
if (!is_cgroup_event(event))
return;
@@ -902,15 +892,16 @@ list_update_cgroup_event(struct perf_event *event,
* this will always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);
-
- /*
- * cpuctx->cgrp is NULL until a cgroup event is sched in or
- * ctx->nr_cgroup == 0 .
- */
- if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
- cpuctx->cgrp = event->cgrp;
- else if (!add)
+ cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+ /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+ if (add) {
+ list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+ if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+ cpuctx->cgrp = event->cgrp;
+ } else {
+ list_del(cpuctx_entry);
cpuctx->cgrp = NULL;
+ }
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1453,6 +1444,20 @@ static void update_group_times(struct perf_event *leader)
update_event_times(event);
}
+static enum event_type_t get_event_type(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ enum event_type_t event_type;
+
+ lockdep_assert_held(&ctx->lock);
+
+ event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
+ if (!ctx->task)
+ event_type |= EVENT_CPU;
+
+ return event_type;
+}
+
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -1469,7 +1474,6 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
-
lockdep_assert_held(&ctx->lock);
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1624,6 +1628,8 @@ static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
* We can have double attach due to group movement in perf_event_open.
*/
@@ -1697,6 +1703,8 @@ static void perf_group_detach(struct perf_event *event)
struct perf_event *sibling, *tmp;
struct list_head *list = NULL;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -1895,9 +1903,29 @@ __perf_remove_from_context(struct perf_event *event,
*/
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
- lockdep_assert_held(&event->ctx->mutex);
+ struct perf_event_context *ctx = event->ctx;
+
+ lockdep_assert_held(&ctx->mutex);
event_function_call(event, __perf_remove_from_context, (void *)flags);
+
+ /*
+ * The above event_function_call() can NO-OP when it hits
+ * TASK_TOMBSTONE. In that case we must already have been detached
+ * from the context (by perf_event_exit_event()) but the grouping
+ * might still be in-tact.
+ */
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ if ((flags & DETACH_GROUP) &&
+ (event->attach_state & PERF_ATTACH_GROUP)) {
+ /*
+ * Since in that case we cannot possibly be scheduled, simply
+ * detach now.
+ */
+ raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
+ raw_spin_unlock_irq(&ctx->lock);
+ }
}
/*
@@ -2203,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task);
static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
if (!cpuctx->task_ctx)
return;
@@ -2211,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ ctx_sched_out(ctx, cpuctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2226,13 +2255,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}
+/*
+ * We want to maintain the following priority of scheduling:
+ * - CPU pinned (EVENT_CPU | EVENT_PINNED)
+ * - task pinned (EVENT_PINNED)
+ * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
+ * - task flexible (EVENT_FLEXIBLE).
+ *
+ * In order to avoid unscheduling and scheduling back in everything every
+ * time an event is added, only do it for the groups of equal priority and
+ * below.
+ *
+ * This can be called after a batch operation on task events, in which case
+ * event_type is a bit mask of the types of events involved. For CPU events,
+ * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
- struct perf_event_context *task_ctx)
+ struct perf_event_context *task_ctx,
+ enum event_type_t event_type)
{
+ enum event_type_t ctx_event_type = event_type & EVENT_ALL;
+ bool cpu_event = !!(event_type & EVENT_CPU);
+
+ /*
+ * If pinned groups are involved, flexible groups also need to be
+ * scheduled out.
+ */
+ if (event_type & EVENT_PINNED)
+ event_type |= EVENT_FLEXIBLE;
+
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx);
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ task_ctx_sched_out(cpuctx, task_ctx, event_type);
+
+ /*
+ * Decide which cpu ctx groups to schedule out based on the types
+ * of events that caused rescheduling:
+ * - EVENT_CPU: schedule out corresponding groups;
+ * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
+ * - otherwise, do nothing more.
+ */
+ if (cpu_event)
+ cpu_ctx_sched_out(cpuctx, ctx_event_type);
+ else if (ctx_event_type & EVENT_PINNED)
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
}
@@ -2249,7 +2316,7 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- bool activate = true;
+ bool reprogram = true;
int ret = 0;
raw_spin_lock(&cpuctx->ctx.lock);
@@ -2257,30 +2324,29 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- /* If we're on the wrong CPU, try again */
- if (task_cpu(ctx->task) != smp_processor_id()) {
- ret = -ESRCH;
- goto unlock;
- }
+ reprogram = (ctx->task == current);
/*
- * If we're on the right CPU, see if the task we target is
- * current, if not we don't have to activate the ctx, a future
- * context switch will do that for us.
+ * If the task is running, it must be running on this CPU,
+ * otherwise we cannot reprogram things.
+ *
+ * If its not running, we don't care, ctx->lock will
+ * serialize against it becoming runnable.
*/
- if (ctx->task != current)
- activate = false;
- else
- WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+ if (task_curr(ctx->task) && !reprogram) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+ WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
} else if (task_ctx) {
raw_spin_lock(&task_ctx->lock);
}
- if (activate) {
+ if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2328,13 +2394,36 @@ perf_install_in_context(struct perf_event_context *ctx,
/*
* Installing events is tricky because we cannot rely on ctx->is_active
* to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * Instead we use task_curr(), which tells us if the task is running.
+ * However, since we use task_curr() outside of rq::lock, we can race
+ * against the actual state. This means the result can be wrong.
+ *
+ * If we get a false positive, we retry, this is harmless.
+ *
+ * If we get a false negative, things are complicated. If we are after
+ * perf_event_context_sched_in() ctx::lock will serialize us, and the
+ * value must be correct. If we're before, it doesn't matter since
+ * perf_event_context_sched_in() will program the counter.
+ *
+ * However, this hinges on the remote context switch having observed
+ * our task->perf_event_ctxp[] store, such that it will in fact take
+ * ctx::lock in perf_event_context_sched_in().
+ *
+ * We do this by task_function_call(), if the IPI fails to hit the task
+ * we know any future context switch of task must see the
+ * perf_event_ctpx[] store.
*/
-again:
+
/*
- * Cannot use task_function_call() because we need to run on the task's
- * CPU regardless of whether its current or not.
+ * This smp_mb() orders the task->perf_event_ctxp[] store with the
+ * task_cpu() load, such that if the IPI then does not find the task
+ * running, a future context switch of that task must observe the
+ * store.
*/
- if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ smp_mb();
+again:
+ if (!task_function_call(task, __perf_install_in_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -2348,12 +2437,16 @@ again:
raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_unlock_irq(&ctx->lock);
/*
- * Since !ctx->is_active doesn't mean anything, we must IPI
- * unconditionally.
+ * If the task is not running, ctx->lock will avoid it becoming so,
+ * thus we can safely install the event.
*/
- goto again;
+ if (task_curr(task)) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
}
/*
@@ -2420,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, get_event_type(event));
}
/*
@@ -2847,7 +2940,7 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(cpuctx, ctx);
+ task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
raw_spin_unlock(&ctx->lock);
}
}
@@ -2894,7 +2987,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
return;
list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
if (WARN_ON_ONCE(!pmu->sched_task))
continue;
@@ -3084,8 +3177,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
* cpu flexible, task flexible.
+ *
+ * However, if task's ctx is not carrying any pinned
+ * events, no need to flip the cpuctx's events around.
*/
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (!list_empty(&ctx->pinned_groups))
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
@@ -3400,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event,
static void perf_event_enable_on_exec(int ctxn)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
+ enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
unsigned long flags;
@@ -3413,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn)
cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- list_for_each_entry(event, &ctx->event_list, event_entry)
+ list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
+ event_type |= get_event_type(event);
+ }
/*
* Unclone and reschedule this context if we enabled any event.
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx);
+ ctx_resched(cpuctx, ctx, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -3438,14 +3538,15 @@ struct perf_read_data {
int ret;
};
-static int find_cpu_to_read(struct perf_event *event, int local_cpu)
+static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
- int event_cpu = event->oncpu;
u16 local_pkg, event_pkg;
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
- event_pkg = topology_physical_package_id(event_cpu);
- local_pkg = topology_physical_package_id(local_cpu);
+ int local_cpu = smp_processor_id();
+
+ event_pkg = topology_physical_package_id(event_cpu);
+ local_pkg = topology_physical_package_id(local_cpu);
if (event_pkg == local_pkg)
return local_cpu;
@@ -3575,7 +3676,7 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0, cpu_to_read, local_cpu;
+ int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
@@ -3588,21 +3689,25 @@ static int perf_event_read(struct perf_event *event, bool group)
.ret = 0,
};
- local_cpu = get_cpu();
- cpu_to_read = find_cpu_to_read(event, local_cpu);
- put_cpu();
+ event_cpu = READ_ONCE(event->oncpu);
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return 0;
+
+ preempt_disable();
+ event_cpu = __perf_event_read_cpu(event, event_cpu);
/*
* Purposely ignore the smp_call_function_single() return
* value.
*
- * If event->oncpu isn't a valid CPU it means the event got
+ * If event_cpu isn't a valid CPU it means the event got
* scheduled out and that will have updated the event count.
*
* Therefore, either way, we'll have an up-to-date event count
* after this.
*/
- (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
+ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
+ preempt_enable();
ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
@@ -6583,6 +6688,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
char *buf = NULL;
char *name;
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
if (file) {
struct inode *inode;
dev_t dev;
@@ -6609,27 +6735,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
maj = MAJOR(dev);
min = MINOR(dev);
- if (vma->vm_flags & VM_READ)
- prot |= PROT_READ;
- if (vma->vm_flags & VM_WRITE)
- prot |= PROT_WRITE;
- if (vma->vm_flags & VM_EXEC)
- prot |= PROT_EXEC;
-
- if (vma->vm_flags & VM_MAYSHARE)
- flags = MAP_SHARED;
- else
- flags = MAP_PRIVATE;
-
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
- flags |= MAP_HUGETLB;
-
goto got_name;
} else {
if (vma->vm_ops && vma->vm_ops->name) {
@@ -7034,25 +7139,12 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7172,34 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}
+ return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -7975,6 +8095,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
if (task == TASK_TOMBSTONE)
return;
+ if (!ifh->nr_file_filters)
+ return;
+
mm = get_task_mm(event->ctx->task);
if (!mm)
goto restart;
@@ -8145,6 +8268,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
* attribute.
*/
if (state == IF_STATE_END) {
+ ret = -EINVAL;
if (kernel && event->attr.exclude_kernel)
goto fail;
@@ -8152,6 +8276,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
if (!filename)
goto fail;
+ /*
+ * For now, we only support file-based filters
+ * in per-task events; doing so for CPU-wide
+ * events requires additional context switching
+ * trickery, since same object code will be
+ * mapped at different virtual addresses in
+ * different processes.
+ */
+ ret = -EOPNOTSUPP;
+ if (!event->ctx->task)
+ goto fail_free_name;
+
/* look up the path and grab its inode */
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
@@ -8167,6 +8303,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
!S_ISREG(filter->inode->i_mode))
/* free_filters_list() will iput() */
goto fail;
+
+ event->addr_filters.nr_file_filters++;
}
/* ready to consume more filters */
@@ -8206,24 +8344,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
if (WARN_ON_ONCE(event->parent))
return -EINVAL;
- /*
- * For now, we only support filtering in per-task events; doing so
- * for CPU-wide events requires additional context switching trickery,
- * since same object code will be mapped at different virtual
- * addresses in different processes.
- */
- if (!event->ctx->task)
- return -EOPNOTSUPP;
-
ret = perf_event_parse_addr_filter(event, filter_str, &filters);
if (ret)
- return ret;
+ goto fail_clear_files;
ret = event->pmu->addr_filters_validate(&filters);
- if (ret) {
- free_filters_list(&filters);
- return ret;
- }
+ if (ret)
+ goto fail_free_filters;
/* remove existing filters, if any */
perf_addr_filters_splice(event, &filters);
@@ -8232,6 +8359,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
perf_event_for_each_child(event, perf_event_addr_filters_apply);
return ret;
+
+fail_free_filters:
+ free_filters_list(&filters);
+
+fail_clear_files:
+ event->addr_filters.nr_file_filters = 0;
+
+ return ret;
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -8583,37 +8718,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
return NULL;
}
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
- if (cpuctx->unique_pmu == old_pmu)
- cpuctx->unique_pmu = pmu;
- }
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- struct pmu *i;
-
mutex_lock(&pmus_lock);
- /*
- * Like a real lame refcount.
- */
- list_for_each_entry(i, &pmus, entry) {
- if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
- update_pmu_context(i, pmu);
- goto out;
- }
- }
-
free_percpu(pmu->pmu_cpu_context);
-out:
mutex_unlock(&pmus_lock);
}
@@ -8817,8 +8925,6 @@ skip_type:
cpuctx->ctx.pmu = pmu;
__perf_mux_hrtimer_init(cpuctx, cpu);
-
- cpuctx->unique_pmu = pmu;
}
got_cpu_context:
@@ -8936,6 +9042,14 @@ static struct pmu *perf_init_event(struct perf_event *event)
idx = srcu_read_lock(&pmus_srcu);
+ /* Try parent's PMU first: */
+ if (event->parent && event->parent->pmu) {
+ pmu = event->parent->pmu;
+ ret = perf_try_init_event(pmu, event);
+ if (!ret)
+ goto unlock;
+ }
+
rcu_read_lock();
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
@@ -9503,6 +9617,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
+/*
+ * Variation on perf_event_ctx_lock_nested(), except we take two context
+ * mutexes.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+ struct perf_event_context *ctx)
+{
+ struct perf_event_context *gctx;
+
+again:
+ rcu_read_lock();
+ gctx = READ_ONCE(group_leader->ctx);
+ if (!atomic_inc_not_zero(&gctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ if (group_leader->ctx != gctx) {
+ mutex_unlock(&ctx->mutex);
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ goto again;
+ }
+
+ return gctx;
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -9746,12 +9891,31 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (move_group) {
- gctx = group_leader->ctx;
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ gctx = __perf_event_ctx_lock_double(group_leader, ctx);
+
if (gctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
goto err_locked;
}
+
+ /*
+ * Check if we raced against another sys_perf_event_open() call
+ * moving the software group underneath us.
+ */
+ if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * If someone moved the group out from under us, check
+ * if this new event wound up on the same ctx, if so
+ * its the regular !move_group case, otherwise fail.
+ */
+ if (gctx != ctx) {
+ err = -EINVAL;
+ goto err_locked;
+ } else {
+ perf_event_ctx_unlock(group_leader, gctx);
+ move_group = 0;
+ }
+ }
} else {
mutex_lock(&ctx->mutex);
}
@@ -9853,7 +10017,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_unpin_context(ctx);
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -9879,7 +10043,7 @@ SYSCALL_DEFINE5(perf_event_open,
err_locked:
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
/* err_file: */
fput(event_file);
@@ -10146,7 +10310,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+ task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@@ -10595,6 +10759,9 @@ static void __init perf_event_init_all_cpus(void)
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+#ifdef CONFIG_CGROUP_PERF
+ INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
diff --git a/kernel/exit.c b/kernel/exit.c
index a9441da69e29..b67c57faa705 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk)
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *uninitialized_var(tty);
- cputime_t utime, stime;
+ u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
@@ -1121,7 +1121,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
struct signal_struct *sig = p->signal;
struct signal_struct *psig = current->signal;
unsigned long maxrss;
- cputime_t tgutime, tgstime;
+ u64 tgutime, tgstime;
/*
* The resource counters for the group leader are in its
diff --git a/kernel/extable.c b/kernel/extable.c
index e3beec4a2339..e1359474baa5 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,6 +20,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <asm/sections.h>
#include <linux/uaccess.h>
@@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr)
return 1;
if (is_ftrace_trampoline(addr))
return 1;
+ if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+ return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
@@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr)
return 1;
if (is_module_text_address(addr))
return 1;
- return is_ftrace_trampoline(addr);
+ if (is_ftrace_trampoline(addr))
+ return 1;
+ if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+ return 1;
+ return 0;
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index ea33f8adc032..ff82e24573b6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1306,6 +1306,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
}
}
+#ifdef CONFIG_POSIX_TIMERS
/*
* Initialize POSIX timer handling for a thread group.
*/
@@ -1315,7 +1316,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
- sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
+ sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
sig->cputimer.running = true;
}
@@ -1324,6 +1325,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
INIT_LIST_HEAD(&sig->cpu_timers[1]);
INIT_LIST_HEAD(&sig->cpu_timers[2]);
}
+#else
+static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
+#endif
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
@@ -1348,11 +1352,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_waitqueue_head(&sig->wait_chldexit);
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
- INIT_LIST_HEAD(&sig->posix_timers);
seqlock_init(&sig->stats_lock);
prev_cputime_init(&sig->prev_cputime);
#ifdef CONFIG_POSIX_TIMERS
+ INIT_LIST_HEAD(&sig->posix_timers);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
#endif
@@ -1427,6 +1431,7 @@ static void rt_mutex_init_task(struct task_struct *p)
#endif
}
+#ifdef CONFIG_POSIX_TIMERS
/*
* Initialize POSIX timer handling for a single task.
*/
@@ -1439,6 +1444,9 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
INIT_LIST_HEAD(&tsk->cpu_timers[1]);
INIT_LIST_HEAD(&tsk->cpu_timers[2]);
}
+#else
+static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
+#endif
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
diff --git a/kernel/futex.c b/kernel/futex.c
index 0842c8ca534b..cdf365036141 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3323,4 +3323,4 @@ static int __init futex_init(void)
return 0;
}
-__initcall(futex_init);
+core_initcall(futex_init);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 74d90a754268..1613bfd48365 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -2,6 +2,7 @@
#include <linux/interrupt.h>
#include <linux/device.h>
#include <linux/gfp.h>
+#include <linux/irq.h>
/*
* Device resource management aware IRQ request/free implementation.
@@ -33,7 +34,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
* @thread_fn: function to be called in a threaded interrupt context. NULL
* for devices which handle everything in @handler
* @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
* @dev_id: A cookie passed back to the handler function
*
* Except for the extra @dev argument, this function takes the
@@ -57,6 +58,9 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
+ if (!devname)
+ devname = dev_name(dev);
+
rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
dev_id);
if (rc) {
@@ -80,7 +84,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
* @thread_fn: function to be called in a threaded interrupt context. NULL
* for devices which handle everything in @handler
* @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
* @dev_id: A cookie passed back to the handler function
*
* Except for the extra @dev argument, this function takes the
@@ -103,6 +107,9 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
+ if (!devname)
+ devname = dev_name(dev);
+
rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
if (rc < 0) {
devres_free(dr);
@@ -137,3 +144,57 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
free_irq(irq, dev_id);
}
EXPORT_SYMBOL(devm_free_irq);
+
+struct irq_desc_devres {
+ unsigned int from;
+ unsigned int cnt;
+};
+
+static void devm_irq_desc_release(struct device *dev, void *res)
+{
+ struct irq_desc_devres *this = res;
+
+ irq_free_descs(this->from, this->cnt);
+}
+
+/**
+ * __devm_irq_alloc_descs - Allocate and initialize a range of irq descriptors
+ * for a managed device
+ * @dev: Device to allocate the descriptors for
+ * @irq: Allocate for specific irq number if irq >= 0
+ * @from: Start the search from this irq number
+ * @cnt: Number of consecutive irqs to allocate
+ * @node: Preferred node on which the irq descriptor should be allocated
+ * @owner: Owning module (can be NULL)
+ * @affinity: Optional pointer to an affinity mask array of size @cnt
+ * which hints where the irq descriptors should be allocated
+ * and which default affinities to use
+ *
+ * Returns the first irq number or error code.
+ *
+ * Note: Use the provided wrappers (devm_irq_alloc_desc*) for simplicity.
+ */
+int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
+ unsigned int cnt, int node, struct module *owner,
+ const struct cpumask *affinity)
+{
+ struct irq_desc_devres *dr;
+ int base;
+
+ dr = devres_alloc(devm_irq_desc_release, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ base = __irq_alloc_descs(irq, from, cnt, node, owner, affinity);
+ if (base < 0) {
+ devres_free(dr);
+ return base;
+ }
+
+ dr->from = base;
+ dr->cnt = cnt;
+ devres_add(dev, dr);
+
+ return base;
+}
+EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c0a0ae43521..b59e6768c5e9 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1346,6 +1346,30 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
+static void __irq_domain_activate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (irq_data->parent_data)
+ __irq_domain_activate_irq(irq_data->parent_data);
+ if (domain->ops->activate)
+ domain->ops->activate(domain, irq_data);
+ }
+}
+
+static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
+ if (irq_data->parent_data)
+ __irq_domain_deactivate_irq(irq_data->parent_data);
+ }
+}
+
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
* interrupt
@@ -1356,13 +1380,9 @@ EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
*/
void irq_domain_activate_irq(struct irq_data *irq_data)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
-
- if (irq_data->parent_data)
- irq_domain_activate_irq(irq_data->parent_data);
- if (domain->ops->activate)
- domain->ops->activate(domain, irq_data);
+ if (!irqd_is_activated(irq_data)) {
+ __irq_domain_activate_irq(irq_data);
+ irqd_set_activated(irq_data);
}
}
@@ -1376,13 +1396,9 @@ void irq_domain_activate_irq(struct irq_data *irq_data)
*/
void irq_domain_deactivate_irq(struct irq_data *irq_data)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
-
- if (domain->ops->deactivate)
- domain->ops->deactivate(domain, irq_data);
- if (irq_data->parent_data)
- irq_domain_deactivate_irq(irq_data->parent_data);
+ if (irqd_is_activated(irq_data)) {
+ __irq_domain_deactivate_irq(irq_data);
+ irqd_clr_activated(irq_data);
}
}
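
The restructuring above is the usual way to make an entry point idempotent: the recursive walk moves into a double-underscore helper, and the public function only invokes it across a state transition recorded with irqd_set_activated()/irqd_clr_activated(). A generic illustration of that shape (types and names hypothetical, not the irqdomain code itself):

struct once_guard {
	bool active;
};

static void __guard_activate(struct once_guard *g)
{
	/* Real work; may recurse into a parent, as __irq_domain_activate_irq() does. */
}

static void guard_activate(struct once_guard *g)
{
	if (!g->active) {
		__guard_activate(g);
		g->active = true;
	}
}

static void guard_deactivate(struct once_guard *g)
{
	if (g->active) {
		/* Undo in the reverse order of activation, then clear the flag. */
		g->active = false;
	}
}
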
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index feaa813b84a9..c53edad7b459 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -487,6 +487,8 @@ int show_interrupts(struct seq_file *p, void *v)
}
if (desc->irq_data.domain)
seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+ else
+ seq_printf(p, " %*s", prec, "");
#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
#endif
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 5707f97a3e6a..061ba7eed4ed 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -175,7 +175,9 @@ out:
static inline int bad_action_ret(irqreturn_t action_ret)
{
- if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+ unsigned int r = action_ret;
+
+ if (likely(r <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
return 0;
return 1;
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 93ad6c1fb9b6..a9b8cf500591 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -182,6 +182,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+void static_key_deferred_flush(struct static_key_deferred *key)
+{
+ STATIC_KEY_CHECK_USE();
+ flush_delayed_work(&key->work);
+}
+EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
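
The new static_key_deferred_flush() lets a user of rate-limited static keys wait for any pending deferred decrement before the key's storage disappears. A minimal sketch of that teardown ordering, with a hypothetical containing structure:

#include <linux/jump_label_ratelimit.h>
#include <linux/slab.h>

struct example_ctx {
	struct static_key_deferred enabled;
};

static void example_ctx_destroy(struct example_ctx *ctx)
{
	/* Queue the (possibly rate-limited) decrement, then make sure the
	 * delayed work has run before the embedded key is freed. */
	static_key_slow_dec_deferred(&ctx->enabled);
	static_key_deferred_flush(&ctx->enabled);
	kfree(ctx);
}
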
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 43460104f119..ebb4dadca66b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -149,9 +149,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
struct kprobe_insn_page *kip;
kprobe_opcode_t *slot = NULL;
+ /* Since the slot array is not protected by rcu, we need a mutex */
mutex_lock(&c->mutex);
retry:
- list_for_each_entry(kip, &c->pages, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
if (kip->nused < slots_per_page(c)) {
int i;
for (i = 0; i < slots_per_page(c); i++) {
@@ -159,6 +161,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->slot_used[i] = SLOT_USED;
kip->nused++;
slot = kip->insns + (i * c->insn_size);
+ rcu_read_unlock();
goto out;
}
}
@@ -167,6 +170,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
WARN_ON(1);
}
}
+ rcu_read_unlock();
/* If there are any garbage slots, collect it and try again. */
if (c->nr_garbage && collect_garbage_slots(c) == 0)
@@ -193,7 +197,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->nused = 1;
kip->ngarbage = 0;
kip->cache = c;
- list_add(&kip->list, &c->pages);
+ list_add_rcu(&kip->list, &c->pages);
slot = kip->insns;
out:
mutex_unlock(&c->mutex);
@@ -213,7 +217,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
* next time somebody inserts a probe.
*/
if (!list_is_singular(&kip->list)) {
- list_del(&kip->list);
+ list_del_rcu(&kip->list);
+ synchronize_rcu();
kip->cache->free(kip->insns);
kfree(kip);
}
@@ -235,8 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
for (i = 0; i < slots_per_page(c); i++) {
- if (kip->slot_used[i] == SLOT_DIRTY &&
- collect_one_slot(kip, i))
+ if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
break;
}
}
@@ -248,29 +252,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c,
kprobe_opcode_t *slot, int dirty)
{
struct kprobe_insn_page *kip;
+ long idx;
mutex_lock(&c->mutex);
- list_for_each_entry(kip, &c->pages, list) {
- long idx = ((long)slot - (long)kip->insns) /
- (c->insn_size * sizeof(kprobe_opcode_t));
- if (idx >= 0 && idx < slots_per_page(c)) {
- WARN_ON(kip->slot_used[idx] != SLOT_USED);
- if (dirty) {
- kip->slot_used[idx] = SLOT_DIRTY;
- kip->ngarbage++;
- if (++c->nr_garbage > slots_per_page(c))
- collect_garbage_slots(c);
- } else
- collect_one_slot(kip, idx);
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ idx = ((long)slot - (long)kip->insns) /
+ (c->insn_size * sizeof(kprobe_opcode_t));
+ if (idx >= 0 && idx < slots_per_page(c))
goto out;
- }
}
- /* Could not free this slot. */
+ /* Could not find this slot. */
WARN_ON(1);
+ kip = NULL;
out:
+ rcu_read_unlock();
+ /* Mark and sweep: this may sleep */
+ if (kip) {
+ /* Check double free */
+ WARN_ON(kip->slot_used[idx] != SLOT_USED);
+ if (dirty) {
+ kip->slot_used[idx] = SLOT_DIRTY;
+ kip->ngarbage++;
+ if (++c->nr_garbage > slots_per_page(c))
+ collect_garbage_slots(c);
+ } else {
+ collect_one_slot(kip, idx);
+ }
+ }
mutex_unlock(&c->mutex);
}
+/*
+ * Check whether the given address is on a page of kprobe instruction slots.
+ * This will be used for checking whether an address on the stack
+ * is in a text area or not.
+ */
+bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
+{
+ struct kprobe_insn_page *kip;
+ bool ret = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if (addr >= (unsigned long)kip->insns &&
+ addr < (unsigned long)kip->insns + PAGE_SIZE) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
#ifdef CONFIG_OPTPROBES
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
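
The kprobes changes convert the insn-page list to RCU so that __is_insn_slot_addr() can walk it without taking the cache mutex. The reader side is the standard RCU list pattern; a self-contained sketch with hypothetical types (writers must still use list_add_rcu()/list_del_rcu() and wait for a grace period before freeing, as collect_one_slot() now does):

#include <linux/rculist.h>

struct addr_range {
	struct list_head list;
	unsigned long start, end;
};

static bool range_list_contains(struct list_head *head, unsigned long addr)
{
	struct addr_range *r;
	bool ret = false;

	rcu_read_lock();
	list_for_each_entry_rcu(r, head, list) {
		if (addr >= r->start && addr < r->end) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}
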
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2318fba86277..8461a4372e8a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -850,7 +850,6 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
list_add(&work->node, &worker->delayed_work_list);
work->worker = worker;
- timer_stats_timer_set_start_info(&dwork->timer);
timer->expires = jiffies + delay;
add_timer(timer);
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index bf6072524756..9812e5dd409e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4412,13 +4412,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
/* Note: the following can be executed concurrently, so be careful. */
printk("\n");
- printk("===============================\n");
- printk("[ INFO: suspicious RCU usage. ]\n");
+ pr_err("===============================\n");
+ pr_err("[ ERR: suspicious RCU usage. ]\n");
print_kernel_ident();
- printk("-------------------------------\n");
- printk("%s:%d %s!\n", file, line, s);
- printk("\nother info that might help us debug this:\n\n");
- printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_err("-------------------------------\n");
+ pr_err("%s:%d %s!\n", file, line, s);
+ pr_err("\nother info that might help us debug this:\n\n");
+ pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: !rcu_is_watching()
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 9bffedd82884..28350dc8ecbb 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -852,6 +852,10 @@ static void lock_torture_cleanup(void)
else
lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+
+ kfree(cxt.lwsa);
+ kfree(cxt.lrsa);
+
end:
torture_cleanup_end();
}
@@ -997,6 +1001,8 @@ static int __init lock_torture_init(void)
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ kfree(writer_tasks);
+ writer_tasks = NULL;
firsterr = -ENOMEM;
goto unwind;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 705e06fe5e6c..ad2d9e22697b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -929,6 +929,20 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+void __sched
+mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
static inline int
ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
@@ -1099,6 +1113,16 @@ int __sched mutex_lock_killable(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_lock_killable);
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token;
+
+ token = io_schedule_prepare();
+ mutex_lock(lock);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io);
+
static noinline void __sched
__mutex_lock_slowpath(struct mutex *lock)
{
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727a56e9..9f9284f37f8d 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/membarrier.h>
+#include <linux/tick.h>
/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
@@ -51,6 +52,9 @@
*/
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -ENOSYS;
if (unlikely(flags))
return -EINVAL;
switch (cmd) {
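
Since the syscall now fails outright with -ENOSYS on nohz_full kernels, userspace callers of membarrier() need to treat that error as "not available" rather than fatal. A hedged userspace sketch; the fallback policy is the caller's choice:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>

static int shared_membarrier(void)
{
	if (syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED, 0) == 0)
		return 0;

	/* ENOSYS: kernel lacks membarrier() or runs nohz_full; the caller
	 * must fall back to its own barrier scheme (e.g. signal-based). */
	return -errno;
}
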
diff --git a/kernel/module.c b/kernel/module.c
index 5088784c0cf9..3d8f126208e3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -389,16 +389,16 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const struct kernel_symbol __start___ksymtab_gpl_future[];
extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const unsigned long __start___kcrctab[];
-extern const unsigned long __start___kcrctab_gpl[];
-extern const unsigned long __start___kcrctab_gpl_future[];
+extern const s32 __start___kcrctab[];
+extern const s32 __start___kcrctab_gpl[];
+extern const s32 __start___kcrctab_gpl_future[];
#ifdef CONFIG_UNUSED_SYMBOLS
extern const struct kernel_symbol __start___ksymtab_unused[];
extern const struct kernel_symbol __stop___ksymtab_unused[];
extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const unsigned long __start___kcrctab_unused[];
-extern const unsigned long __start___kcrctab_unused_gpl[];
+extern const s32 __start___kcrctab_unused[];
+extern const s32 __start___kcrctab_unused_gpl[];
#endif
#ifndef CONFIG_MODVERSIONS
@@ -497,7 +497,7 @@ struct find_symbol_arg {
/* Output */
struct module *owner;
- const unsigned long *crc;
+ const s32 *crc;
const struct kernel_symbol *sym;
};
@@ -563,7 +563,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
- const unsigned long **crc,
+ const s32 **crc,
bool gplok,
bool warn)
{
@@ -1145,7 +1145,7 @@ static size_t module_flags_taint(struct module *mod, char *buf)
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
if (taint_flags[i].module && test_bit(i, &mod->taints))
- buf[l++] = taint_flags[i].true;
+ buf[l++] = taint_flags[i].c_true;
}
return l;
@@ -1249,23 +1249,17 @@ static int try_to_force_load(struct module *mod, const char *reason)
}
#ifdef CONFIG_MODVERSIONS
-/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
-static unsigned long maybe_relocated(unsigned long crc,
- const struct module *crc_owner)
+
+static u32 resolve_rel_crc(const s32 *crc)
{
-#ifdef ARCH_RELOCATES_KCRCTAB
- if (crc_owner == NULL)
- return crc - (unsigned long)reloc_start;
-#endif
- return crc;
+ return *(u32 *)((void *)crc + *crc);
}
static int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
struct module *mod,
- const unsigned long *crc,
- const struct module *crc_owner)
+ const s32 *crc)
{
unsigned int i, num_versions;
struct modversion_info *versions;
@@ -1283,13 +1277,19 @@ static int check_version(Elf_Shdr *sechdrs,
/ sizeof(struct modversion_info);
for (i = 0; i < num_versions; i++) {
+ u32 crcval;
+
if (strcmp(versions[i].name, symname) != 0)
continue;
- if (versions[i].crc == maybe_relocated(*crc, crc_owner))
+ if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
+ crcval = resolve_rel_crc(crc);
+ else
+ crcval = *crc;
+ if (versions[i].crc == crcval)
return 1;
- pr_debug("Found checksum %lX vs module %lX\n",
- maybe_relocated(*crc, crc_owner), versions[i].crc);
+ pr_debug("Found checksum %X vs module %lX\n",
+ crcval, versions[i].crc);
goto bad_version;
}
@@ -1307,7 +1307,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
unsigned int versindex,
struct module *mod)
{
- const unsigned long *crc;
+ const s32 *crc;
/*
* Since this should be found in kernel (which can't be removed), no
@@ -1321,8 +1321,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
}
preempt_enable();
return check_version(sechdrs, versindex,
- VMLINUX_SYMBOL_STR(module_layout), mod, crc,
- NULL);
+ VMLINUX_SYMBOL_STR(module_layout), mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1340,8 +1339,7 @@ static inline int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
struct module *mod,
- const unsigned long *crc,
- const struct module *crc_owner)
+ const s32 *crc)
{
return 1;
}
@@ -1368,7 +1366,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
{
struct module *owner;
const struct kernel_symbol *sym;
- const unsigned long *crc;
+ const s32 *crc;
int err;
/*
@@ -1383,8 +1381,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
if (!sym)
goto unlock;
- if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
- owner)) {
+ if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
}
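
With CONFIG_MODULE_REL_CRCS the kcrctab no longer stores absolute CRC values but signed 32-bit offsets from the table entry to the CRC, which is what resolve_rel_crc() above decodes. The arithmetic in isolation, as a standalone illustration rather than kernel code:

#include <stdint.h>

/* The real CRC lives at: address of the entry + value of the entry. */
static uint32_t resolve_place_relative_crc(const int32_t *entry)
{
	return *(const uint32_t *)((const char *)entry + *entry);
}
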
diff --git a/kernel/panic.c b/kernel/panic.c
index c51edaa04fce..08aa88dde7de 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -249,7 +249,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- pr_emerg("Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
@@ -355,7 +355,7 @@ const char *print_tainted(void)
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
const struct taint_flag *t = &taint_flags[i];
*s++ = test_bit(i, &tainted_mask) ?
- t->true : t->false;
+ t->c_true : t->c_false;
}
*s = 0;
} else
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index df9e8e9e0be7..eef2ce968636 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -151,8 +151,12 @@ out:
static void delayed_free_pidns(struct rcu_head *p)
{
- kmem_cache_free(pid_ns_cachep,
- container_of(p, struct pid_namespace, rcu));
+ struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
+
+ dec_pid_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+
+ kmem_cache_free(pid_ns_cachep, ns);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
@@ -162,8 +166,6 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
- dec_pid_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
call_rcu(&ns->rcu, delayed_free_pidns);
}
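
The pid-namespace fix moves the puts that dereference the namespace into the RCU callback, so they run only after every RCU reader is guaranteed to be done with it. This is the general call_rcu() deferred-free shape, sketched with hypothetical types:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_obj {
	struct rcu_head rcu;
	void *backing;
};

static void example_obj_free_rcu(struct rcu_head *p)
{
	struct example_obj *obj = container_of(p, struct example_obj, rcu);

	/* Safe to touch obj here: no pre-existing reader can still see it. */
	kfree(obj->backing);
	kfree(obj);
}

static void example_obj_destroy(struct example_obj *obj)
{
	call_rcu(&obj->rcu, example_obj_free_rcu);
}
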
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f67ceb7768b8..15e6baef5c73 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -46,7 +46,7 @@ static const char * const mem_sleep_labels[] = {
const char *mem_sleep_states[PM_SUSPEND_MAX];
suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
-suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
+static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
unsigned int pm_suspend_global_flags;
EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
@@ -168,7 +168,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
}
if (valid_state(PM_SUSPEND_MEM)) {
mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
- if (mem_sleep_default >= PM_SUSPEND_MEM)
+ if (mem_sleep_default == PM_SUSPEND_MEM)
mem_sleep_current = PM_SUSPEND_MEM;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8b2696420abb..4ba3d34938c0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1516,7 +1516,7 @@ static void call_console_drivers(int level,
{
struct console *con;
- trace_console(text, len);
+ trace_console_rcuidle(text, len);
if (!console_drivers)
return;
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 80adef7d4c3d..0d6ff3e471be 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
#define TPS(x) tracepoint_string(x)
void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
/*
* This function really isn't for public consumption, but RCU is special in
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 87c51225ceec..d81345be730e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -564,10 +564,25 @@ static void srcu_torture_stats(void)
pr_alert("%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
long c0, c1;
+ struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
- c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
- c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
+ u0 = counts->unlock_count[!idx];
+ u1 = counts->unlock_count[idx];
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = counts->lock_count[!idx];
+ l1 = counts->lock_count[idx];
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
pr_cont("\n");
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 9b9cdd549caa..e773129c8b08 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
rcu_batch_init(&sp->batch_check1);
rcu_batch_init(&sp->batch_done);
INIT_DELAYED_WORK(&sp->work, process_srcu);
- sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+ sp->per_cpu_ref = alloc_percpu(struct srcu_array);
return sp->per_cpu_ref ? 0 : -ENOMEM;
}
@@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
- * Returns approximate total of the readers' ->seq[] values for the
+ * Returns approximate total of the readers' ->lock_count[] values for the
* rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
{
int cpu;
unsigned long sum = 0;
- unsigned long t;
for_each_possible_cpu(cpu) {
- t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
- sum += t;
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->lock_count[idx]);
}
return sum;
}
/*
- * Returns approximate number of readers active on the specified rank
- * of the per-CPU ->c[] counters.
+ * Returns approximate total of the readers' ->unlock_count[] values for the
+ * rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
{
int cpu;
unsigned long sum = 0;
- unsigned long t;
for_each_possible_cpu(cpu) {
- t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
- sum += t;
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->unlock_count[idx]);
}
return sum;
}
/*
* Return true if the number of pre-existing readers is determined to
- * be stably zero. An example unstable zero can occur if the call
- * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
- * but due to task migration, sees the corresponding __srcu_read_unlock()
- * decrement. This can happen because srcu_readers_active_idx() takes
- * time to sum the array, and might in fact be interrupted or preempted
- * partway through the summation.
+ * be zero.
*/
static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
{
- unsigned long seq;
+ unsigned long unlocks;
- seq = srcu_readers_seq_idx(sp, idx);
+ unlocks = srcu_readers_unlock_idx(sp, idx);
/*
- * The following smp_mb() A pairs with the smp_mb() B located in
- * __srcu_read_lock(). This pairing ensures that if an
- * __srcu_read_lock() increments its counter after the summation
- * in srcu_readers_active_idx(), then the corresponding SRCU read-side
- * critical section will see any changes made prior to the start
- * of the current SRCU grace period.
+ * Make sure that a lock is always counted if the corresponding unlock
+ * is counted. Needs to be an smp_mb() as the read side may contain a
+ * read from a variable that is written to before the synchronize_srcu()
+ * in the write side. In this case smp_mb()s A and B act like the store
+ * buffering pattern.
*
- * Also, if the above call to srcu_readers_seq_idx() saw the
- * increment of ->seq[], then the call to srcu_readers_active_idx()
- * must see the increment of ->c[].
+ * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
+ * synchronize_srcu() from being executed before the grace period ends.
*/
smp_mb(); /* A */
/*
- * Note that srcu_readers_active_idx() can incorrectly return
- * zero even though there is a pre-existing reader throughout.
- * To see this, suppose that task A is in a very long SRCU
- * read-side critical section that started on CPU 0, and that
- * no other reader exists, so that the sum of the counters
- * is equal to one. Then suppose that task B starts executing
- * srcu_readers_active_idx(), summing up to CPU 1, and then that
- * task C starts reading on CPU 0, so that its increment is not
- * summed, but finishes reading on CPU 2, so that its decrement
- * -is- summed. Then when task B completes its sum, it will
- * incorrectly get zero, despite the fact that task A has been
- * in its SRCU read-side critical section the whole time.
- *
- * We therefore do a validation step should srcu_readers_active_idx()
- * return zero.
- */
- if (srcu_readers_active_idx(sp, idx) != 0)
- return false;
-
- /*
- * The remainder of this function is the validation step.
- * The following smp_mb() D pairs with the smp_mb() C in
- * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
- * by srcu_readers_active_idx() above, then any destructive
- * operation performed after the grace period will happen after
- * the corresponding SRCU read-side critical section.
+ * If the locks are the same as the unlocks, then there must have
+ * been no readers on this index at some time in between. This does not
+ * mean that there are no more readers, as one could have read the
+ * current index but not have incremented the lock counter yet.
*
- * Note that there can be at most NR_CPUS worth of readers using
- * the old index, which is not enough to overflow even a 32-bit
- * integer. (Yes, this does mean that systems having more than
- * a billion or so CPUs need to be 64-bit systems.) Therefore,
- * the sum of the ->seq[] counters cannot possibly overflow.
- * Therefore, the only way that the return values of the two
- * calls to srcu_readers_seq_idx() can be equal is if there were
- * no increments of the corresponding rank of ->seq[] counts
- * in the interim. But the missed-increment scenario laid out
- * above includes an increment of the ->seq[] counter by
- * the corresponding __srcu_read_lock(). Therefore, if this
- * scenario occurs, the return values from the two calls to
- * srcu_readers_seq_idx() will differ, and thus the validation
- * step below suffices.
+ * Possible bug: There is no guarantee that there haven't been ULONG_MAX
+ * increments of ->lock_count[] since the unlocks were counted, meaning
+ * that this could return true even if there are still active readers.
+ * Since there are no memory barriers around srcu_flip(), the CPU is not
+ * required to increment ->completed before running
+ * srcu_readers_unlock_idx(), which means that there could be an
+ * arbitrarily large number of critical sections that execute after
+ * srcu_readers_unlock_idx() but use the old value of ->completed.
*/
- smp_mb(); /* D */
-
- return srcu_readers_seq_idx(sp, idx) == seq;
+ return srcu_readers_lock_idx(sp, idx) == unlocks;
}
/**
@@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
- sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->lock_count[0]);
+ sum += READ_ONCE(cpuc->lock_count[1]);
+ sum -= READ_ONCE(cpuc->unlock_count[0]);
+ sum -= READ_ONCE(cpuc->unlock_count[1]);
}
return sum;
}
@@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
int idx;
idx = READ_ONCE(sp->completed) & 0x1;
- __this_cpu_inc(sp->per_cpu_ref->c[idx]);
+ __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_dec(sp->per_cpu_ref->c[idx]);
+ this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -349,12 +315,21 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
/*
* Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->c[] and ->seq[] arrays. This allows
+ * use the other rank of the ->(un)lock_count[] arrays. This allows
* us to wait for pre-existing readers in a starvation-free manner.
*/
static void srcu_flip(struct srcu_struct *sp)
{
- sp->completed++;
+ WRITE_ONCE(sp->completed, sp->completed + 1);
+
+ /*
+ * Ensure that if the updater misses an __srcu_read_unlock()
+ * increment, that task's next __srcu_read_lock() will see the
+ * above counter update. Note that both this memory barrier
+ * and the one in srcu_readers_active_idx_check() provide the
+ * guarantee for __srcu_read_lock().
+ */
+ smp_mb(); /* D */ /* Pairs with C. */
}
/*
@@ -392,6 +367,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
head->next = NULL;
head->func = func;
spin_lock_irqsave(&sp->queue_lock, flags);
+ smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
rcu_batch_queue(&sp->batch_queue, head);
if (!sp->running) {
sp->running = true;
@@ -425,6 +401,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
head->next = NULL;
head->func = wakeme_after_rcu;
spin_lock_irq(&sp->queue_lock);
+ smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
if (!sp->running) {
/* steal the processing owner */
sp->running = true;
@@ -444,8 +421,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
spin_unlock_irq(&sp->queue_lock);
}
- if (!done)
+ if (!done) {
wait_for_completion(&rcu.completion);
+ smp_mb(); /* Caller's later accesses after GP. */
+ }
+
}
/**
@@ -613,7 +593,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
/*
* Invoke a limited number of SRCU callbacks that have passed through
* their grace period. If there are more to do, SRCU will reschedule
- * the workqueue.
+ * the workqueue. Note that needed memory barriers have been executed
+ * in this task's context by srcu_readers_active_idx_check().
*/
static void srcu_invoke_callbacks(struct srcu_struct *sp)
{
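
The SRCU rework replaces the increment/decrement pair on ->c[] with monotonically increasing lock_count[]/unlock_count[] arrays: a grace period for a given index can end once the summed lock and unlock counts agree. A simplified, self-contained sketch of that check for a single index (outside the real srcu_struct, names hypothetical):

#include <linux/percpu.h>
#include <linux/smp.h>

struct reader_counts {
	unsigned long lock_count;	/* incremented on read-side entry */
	unsigned long unlock_count;	/* incremented on read-side exit */
};

static bool readers_quiesced(struct reader_counts __percpu *counts)
{
	unsigned long locks = 0, unlocks = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		unlocks += READ_ONCE(per_cpu_ptr(counts, cpu)->unlock_count);

	smp_mb(); /* Pairs with the reader's barrier, like smp_mb() A above. */

	for_each_possible_cpu(cpu)
		locks += READ_ONCE(per_cpu_ptr(counts, cpu)->lock_count);

	/* Equal sums imply a moment with no readers on this index. */
	return locks == unlocks;
}
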
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1898559e6b60..fa6a48d3917b 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -41,8 +41,6 @@
/* Forward declarations for tiny_plugin.h. */
struct rcu_ctrlblk;
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
-static void rcu_process_callbacks(struct softirq_action *unused);
static void __call_rcu(struct rcu_head *head,
rcu_callback_t func,
struct rcu_ctrlblk *rcp);
@@ -185,9 +183,6 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
* benefits of doing might_sleep() to reduce latency.)
*
* Cool, huh? (Due to Josh Triplett.)
- *
- * But we want to make this a static inline later. The cond_resched()
- * currently makes this problematic.
*/
void synchronize_sched(void)
{
@@ -195,7 +190,6 @@ void synchronize_sched(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_sched() in RCU read-side critical section");
- cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 196f0302e2f4..c64b827ecbca 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
/*
* During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously.
+ * invoked, we start taking RCU lockdep issues seriously. Note that unlike
+ * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
+ * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
+ * The reason for this is that Tiny RCU does not need kthreads, so it does
+ * not have to care about the fact that the scheduler is half-initialized
+ * at a certain phase of the boot process.
*/
void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 96c52e43f7ca..d80e0d2f68c6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
int sysctl_panic_on_rcu_stall __read_mostly;
/*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned. So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
+ * The rcu_scheduler_active variable is initialized to the value
+ * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before the
+ * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
+ * RCU can assume that there is but one task, allowing RCU to (for example)
* optimize synchronize_rcu() to a simple barrier(). When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods. This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
+ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
+ * to detect real grace periods. This variable is also used to suppress
+ * boot-time false positives from lockdep-RCU error checking. Finally, it
+ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
+ * is fully initialized, including all of its kthreads having been spawned.
*/
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
@@ -278,6 +281,116 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
+/*
+ * Record entry into an extended quiescent state. This is only to be
+ * called when not already in an extended quiescent state.
+ */
+static void rcu_dynticks_eqs_enter(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int special;
+
+ /*
+ * CPUs seeing atomic_inc_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ special = atomic_inc_return(&rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1);
+}
+
+/*
+ * Record exit from an extended quiescent state. This is only to be
+ * called from an extended quiescent state.
+ */
+static void rcu_dynticks_eqs_exit(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int special;
+
+ /*
+ * CPUs seeing atomic_inc_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ special = atomic_inc_return(&rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1));
+}
+
+/*
+ * Reset the current CPU's ->dynticks counter to indicate that the
+ * newly onlined CPU is no longer in an extended quiescent state.
+ * This will either leave the counter unchanged, or increment it
+ * to the next non-quiescent value.
+ *
+ * The non-atomic test/increment sequence works because the upper bits
+ * of the ->dynticks counter are manipulated only by the corresponding CPU,
+ * or when the corresponding CPU is offline.
+ */
+static void rcu_dynticks_eqs_online(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ if (atomic_read(&rdtp->dynticks) & 0x1)
+ return;
+ atomic_add(0x1, &rdtp->dynticks);
+}
+
+/*
+ * Is the current CPU in an extended quiescent state?
+ *
+ * No ordering, as we are sampling CPU-local information.
+ */
+bool rcu_dynticks_curr_cpu_in_eqs(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ return !(atomic_read(&rdtp->dynticks) & 0x1);
+}
+
+/*
+ * Snapshot the ->dynticks counter with full ordering so as to allow
+ * stable comparison of this counter with past and future snapshots.
+ */
+int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
+{
+ int snap = atomic_add_return(0, &rdtp->dynticks);
+
+ return snap;
+}
+
+/*
+ * Return true if the snapshot returned from rcu_dynticks_snap()
+ * indicates that RCU is in an extended quiescent state.
+ */
+static bool rcu_dynticks_in_eqs(int snap)
+{
+ return !(snap & 0x1);
+}
+
+/*
+ * Return true if the CPU corresponding to the specified rcu_dynticks
+ * structure has spent some time in an extended quiescent state since
+ * rcu_dynticks_snap() returned the specified snapshot.
+ */
+static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
+{
+ return snap != rcu_dynticks_snap(rdtp);
+}
+
+/*
+ * Do a double-increment of the ->dynticks counter to emulate a
+ * momentary idle-CPU quiescent state.
+ */
+static void rcu_dynticks_momentary_idle(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int special = atomic_add_return(2, &rdtp->dynticks);
+
+ /* It is illegal to call this from idle state. */
+ WARN_ON_ONCE(!(special & 0x1));
+}
+
DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
@@ -297,7 +410,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
static void rcu_momentary_dyntick_idle(void)
{
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp;
int resched_mask;
struct rcu_state *rsp;
@@ -324,10 +436,7 @@ static void rcu_momentary_dyntick_idle(void)
* quiescent state, with no need for this CPU to do anything
* further.
*/
- rdtp = this_cpu_ptr(&rcu_dynticks);
- smp_mb__before_atomic(); /* Earlier stuff before QS. */
- atomic_add(2, &rdtp->dynticks); /* QS. */
- smp_mb__after_atomic(); /* Later stuff after QS. */
+ rcu_dynticks_momentary_idle();
break;
}
}
@@ -608,7 +717,7 @@ static int
cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
{
return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
- rdp->nxttail[RCU_DONE_TAIL] != NULL;
+ rdp->nxttail[RCU_NEXT_TAIL] != NULL;
}
/*
@@ -670,7 +779,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -689,12 +798,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
do_nocb_deferred_wakeup(rdp);
}
rcu_prepare_for_idle();
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_eqs_enter();
rcu_dynticks_task_enter();
/*
@@ -823,15 +927,10 @@ void rcu_irq_exit_irqson(void)
*/
static void rcu_eqs_exit_common(long long oldval, int user)
{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
rcu_dynticks_task_exit();
- smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(atomic_read(&rdtp->dynticks) & 0x1));
+ rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -977,12 +1076,8 @@ void rcu_nmi_enter(void)
* to be in the outermost NMI handler that interrupted an RCU-idle
* period (observation due to Andy Lutomirski).
*/
- if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
- smp_mb__before_atomic(); /* Force delay from prior write. */
- atomic_inc(&rdtp->dynticks);
- /* atomic_inc() before later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ if (rcu_dynticks_curr_cpu_in_eqs()) {
+ rcu_dynticks_eqs_exit();
incby = 1;
}
rdtp->dynticks_nmi_nesting += incby;
@@ -1007,7 +1102,7 @@ void rcu_nmi_exit(void)
* to us!)
*/
WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
/*
* If the nesting level is not 1, the CPU wasn't RCU-idle, so
@@ -1020,11 +1115,7 @@ void rcu_nmi_exit(void)
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
rdtp->dynticks_nmi_nesting = 0;
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* Force delay to next write. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_eqs_enter();
}
/**
@@ -1037,7 +1128,7 @@ void rcu_nmi_exit(void)
*/
bool notrace __rcu_is_watching(void)
{
- return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
+ return !rcu_dynticks_curr_cpu_in_eqs();
}
/**
@@ -1120,9 +1211,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
static int dyntick_save_progress_counter(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
- rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
rcu_sysidle_check_cpu(rdp, isidle, maxj);
- if ((rdp->dynticks_snap & 0x1) == 0) {
+ if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
rdp->mynode->gpnum))
@@ -1141,12 +1232,10 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
- unsigned int curr;
+ unsigned long jtsq;
int *rcrmp;
- unsigned int snap;
-
- curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
- snap = (unsigned int)rdp->dynticks_snap;
+ unsigned long rjtsc;
+ struct rcu_node *rnp;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -1156,27 +1245,39 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
rdp->dynticks_fqs++;
return 1;
}
+ /* Compute and saturate jiffies_till_sched_qs. */
+ jtsq = jiffies_till_sched_qs;
+ rjtsc = rcu_jiffies_till_stall_check();
+ if (jtsq > rjtsc / 2) {
+ WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
+ jtsq = rjtsc / 2;
+ } else if (jtsq < 1) {
+ WRITE_ONCE(jiffies_till_sched_qs, 1);
+ jtsq = 1;
+ }
+
/*
- * Check for the CPU being offline, but only if the grace period
- * is old enough. We don't need to worry about the CPU changing
- * state: If we see it offline even once, it has been through a
- * quiescent state.
- *
- * The reason for insisting that the grace period be at least
- * one jiffy old is that CPUs that are not quite online and that
- * have just gone offline can still execute RCU read-side critical
- * sections.
+ * Has this CPU encountered a cond_resched_rcu_qs() since the
+ * beginning of the grace period? For this to be the case,
+ * the CPU has to have noticed the current grace period. This
+ * might not be the case for nohz_full CPUs looping in the kernel.
*/
- if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
- return 0; /* Grace period is not old enough. */
- barrier();
- if (cpu_is_offline(rdp->cpu)) {
+ rnp = rdp->mynode;
+ if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
+ READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) &&
+ READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
+ return 1;
+ }
+
+ /* Check for the CPU being offline. */
+ if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
rdp->offline_fqs++;
return 1;
@@ -1204,9 +1305,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* warning delay.
*/
rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
- if (ULONG_CMP_GE(jiffies,
- rdp->rsp->gp_start + jiffies_till_sched_qs) ||
- ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ if (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
+ time_after(jiffies, rdp->rsp->jiffies_resched)) {
if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
WRITE_ONCE(rdp->cond_resched_completed,
READ_ONCE(rdp->mynode->completed));
@@ -1217,11 +1317,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
- /* And if it has been a really long time, kick the CPU as well. */
- if (ULONG_CMP_GE(jiffies,
- rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) ||
- ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs))
- resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ /*
+ * If more than halfway to RCU CPU stall-warning time, do
+ * a resched_cpu() to try to loosen things up a bit.
+ */
+ if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2)
+ resched_cpu(rdp->cpu);
return 0;
}
@@ -1274,7 +1375,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
}
/*
- * Dump stacks of all tasks running on stalled CPUs.
+ * Dump stacks of all tasks running on stalled CPUs. First try using
+ * NMIs, but fall back to manual remote stack tracing on architectures
+ * that don't support NMI-based stack dumps. The NMI-triggered stack
+ * traces are more accurate because they are printed by the target CPU.
*/
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
@@ -1284,11 +1388,10 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (rnp->qsmask != 0) {
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ if (!trigger_single_cpu_backtrace(cpu))
dump_cpu_task(cpu);
- }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1376,6 +1479,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
(long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected) {
rcu_dump_cpu_stacks(rsp);
+
+ /* Complain about tasks blocking the grace period. */
+ rcu_print_detail_task_stall(rsp);
} else {
if (READ_ONCE(rsp->gpnum) != gpnum ||
READ_ONCE(rsp->completed) == gpnum) {
@@ -1392,9 +1498,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
}
}
- /* Complain about tasks blocking the grace period. */
- rcu_print_detail_task_stall(rsp);
-
rcu_check_gp_kthread_starvation(rsp);
panic_on_rcu_stall();
@@ -2464,10 +2567,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
rnp = rdp->mynode;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if ((rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
- rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
- rdp->gpwrap) {
+ if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum ||
+ rnp->completed == rnp->gpnum || rdp->gpwrap) {
/*
* The grace period in which this quiescent state was
@@ -2522,8 +2623,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
+ if (rdp->cpu_no_qs.b.norm)
return;
/*
@@ -3477,9 +3577,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
rdp->n_rp_core_needs_qs++;
- } else if (rdp->core_needs_qs &&
- (!rdp->cpu_no_qs.b.norm ||
- rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
+ } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
rdp->n_rp_report_qs++;
return 1;
}
@@ -3745,7 +3843,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
- WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks)));
rdp->cpu = cpu;
rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
@@ -3762,7 +3860,6 @@ static void
rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -3775,8 +3872,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_sysidle_init_percpu_data(rdp->dynticks);
- atomic_set(&rdp->dynticks->dynticks,
- (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
+ rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
@@ -3785,7 +3881,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
* of the next grace period.
*/
rnp = rdp->mynode;
- mask = rdp->grpmask;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
if (!rdp->beenonline)
WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
@@ -3869,7 +3964,7 @@ void rcu_cpu_starting(unsigned int cpu)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp) {
- rdp = this_cpu_ptr(rsp->rda);
+ rdp = per_cpu_ptr(rsp->rda, cpu);
rnp = rdp->mynode;
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -3980,18 +4075,22 @@ static int __init rcu_spawn_gp_kthread(void)
early_initcall(rcu_spawn_gp_kthread);
/*
- * This function is invoked towards the end of the scheduler's initialization
- * process. Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system). After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections. This function also enables RCU lockdep checking.
+ * This function is invoked towards the end of the scheduler's
+ * initialization process. Before this is called, the idle task might
+ * contain synchronous grace-period primitives (during which time, this idle
+ * task is booting the system, and such primitives are no-ops). After this
+ * function is called, any synchronous grace-period primitives are run as
+ * expedited, with the requesting task driving the grace period forward.
+ * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * runtime RCU functionality.
*/
void rcu_scheduler_starting(void)
{
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_INIT;
+ rcu_test_sync_prims();
}
/*
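
The dynticks helpers introduced above centralize one convention: ->dynticks is incremented once on every transition, so its low bit is 1 while the CPU is non-idle and 0 while it is in an extended quiescent state, and any change relative to an earlier non-idle snapshot means the CPU has passed through a quiescent state since. A minimal standalone sketch of that parity logic, not the kernel structures themselves:

#include <stdbool.h>

static bool snap_in_eqs(int snap)
{
	/* Even counter value: the CPU was in an extended quiescent state. */
	return !(snap & 0x1);
}

static bool in_eqs_since(int old_snap, int new_snap)
{
	/* Assuming old_snap was taken while the CPU was non-idle (odd),
	 * any movement of the counter implies at least one EQS pass. */
	return new_snap != old_snap;
}
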
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fe98dd24adf8..b60f2b6caa14 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -521,7 +521,6 @@ struct rcu_state {
struct mutex exp_mutex; /* Serialize expedited GP. */
struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
- atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
@@ -595,6 +594,8 @@ extern struct rcu_state rcu_bh_state;
extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
+int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
+
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
@@ -688,18 +689,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#endif /* #ifdef CONFIG_RCU_TRACE */
/*
- * Place this after a lock-acquisition primitive to guarantee that
- * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies
- * if the UNLOCK and LOCK are executed by the same CPU or if the
- * UNLOCK and LOCK operate on the same lock variable.
- */
-#ifdef CONFIG_PPC
-#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
-#else /* #ifdef CONFIG_PPC */
-#define smp_mb__after_unlock_lock() do { } while (0)
-#endif /* #else #ifdef CONFIG_PPC */
-
-/*
* Wrappers for the rcu_node::lock acquire and release.
*
* Because the rcu_nodes form a tree, the tree traversal locking will observe
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d3053e99fdb6..a7b639ccd46e 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -20,16 +20,26 @@
* Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
-/* Wrapper functions for expedited grace periods. */
+/*
+ * Record the start of an expedited grace period.
+ */
static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
{
rcu_seq_start(&rsp->expedited_sequence);
}
+
+/*
+ * Record the end of an expedited grace period.
+ */
static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
{
rcu_seq_end(&rsp->expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
+
+/*
+ * Take a snapshot of the expedited-grace-period counter.
+ */
static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
{
unsigned long s;
@@ -39,6 +49,12 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
return s;
}
+
+/*
+ * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true
+ * if a full expedited grace period has elapsed since that snapshot
+ * was taken.
+ */
static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
{
return rcu_seq_done(&rsp->expedited_sequence, s);
@@ -356,12 +372,11 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
mask_ofl_test = 0;
for_each_leaf_node_possible_cpu(rnp, cpu) {
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
rdp->exp_dynticks_snap =
- atomic_add_return(0, &rdtp->dynticks);
+ rcu_dynticks_snap(rdp->dynticks);
if (raw_smp_processor_id() == cpu ||
- !(rdp->exp_dynticks_snap & 0x1) ||
+ rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) ||
!(rnp->qsmaskinitnext & rdp->grpmask))
mask_ofl_test |= rdp->grpmask;
}
@@ -380,13 +395,12 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
if (!(mask_ofl_ipi & mask))
continue;
retry_ipi:
- if (atomic_add_return(0, &rdtp->dynticks) !=
- rdp->exp_dynticks_snap) {
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+ rdp->exp_dynticks_snap)) {
mask_ofl_test |= mask;
continue;
}
@@ -532,18 +546,28 @@ struct rcu_exp_work {
};
/*
+ * Common code to drive an expedited grace period forward, used by
+ * workqueues and mid-boot-time tasks.
+ */
+static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
+ smp_call_func_t func, unsigned long s)
+{
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ sync_rcu_exp_select_cpus(rsp, func);
+
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(rsp, s);
+}
+
+/*
* Work-queue handler to drive an expedited grace period forward.
*/
static void wait_rcu_exp_gp(struct work_struct *wp)
{
struct rcu_exp_work *rewp;
- /* Initialize the rcu_node tree in preparation for the wait. */
rewp = container_of(wp, struct rcu_exp_work, rew_work);
- sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
-
- /* Wait and clean up, including waking everyone. */
- rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
+ rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
}
/*
@@ -569,12 +593,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */
- /* Marshall arguments and schedule the expedited grace period. */
- rew.rew_func = func;
- rew.rew_rsp = rsp;
- rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- schedule_work(&rew.rew_work);
+ /* Ensure that load happens before action based on it. */
+ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(rsp, func, s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_func = func;
+ rew.rew_rsp = rsp;
+ rew.rew_s = s;
+ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+ schedule_work(&rew.rew_work);
+ }
/* Wait for expedited grace period to complete. */
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
@@ -607,6 +637,11 @@ void synchronize_sched_expedited(void)
{
struct rcu_state *rsp = &rcu_sched_state;
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched_expedited() in RCU read-side critical section");
+
/* If only one CPU, this is automatically a grace period. */
if (rcu_blocking_is_gp())
return;
@@ -676,6 +711,13 @@ void synchronize_rcu_expedited(void)
{
struct rcu_state *rsp = rcu_state_p;
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
_synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -693,3 +735,15 @@ void synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Switch to run-time mode once Tree RCU has fully initialized.
+ */
+static int __init rcu_exp_runtime_mode(void)
+{
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ rcu_test_sync_prims();
+ return 0;
+}
+core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 85c5a883c6e3..a240f3308be6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -670,7 +670,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (!rcu_scheduler_active)
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
@@ -1643,7 +1643,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
"N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
ticks_value, ticks_title,
- atomic_read(&rdtp->dynticks) & 0xfff,
+ rcu_dynticks_snap(rdtp) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
@@ -2366,8 +2366,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
}
/*
- * Each pass through this loop sets up one rcu_data structure and
- * spawns one rcu_nocb_kthread().
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, we will
+ * spawn the needed set of rcu_nocb_kthread() kthreads.
*/
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index b1f28972872c..8751a748499a 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -124,7 +124,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
rdp->core_needs_qs);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
- atomic_read(&rdp->dynticks->dynticks),
+ rcu_dynticks_snap(rdp->dynticks),
rdp->dynticks->dynticks_nesting,
rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
@@ -194,9 +194,8 @@ static int show_rcuexp(struct seq_file *m, void *v)
s2 += atomic_long_read(&rdp->exp_workdone2);
s3 += atomic_long_read(&rdp->exp_workdone3);
}
- seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n",
rsp->expedited_sequence, s0, s1, s2, s3,
- atomic_long_read(&rsp->expedited_normal),
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
return 0;
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f19271dce0a9..9e03db9ea9c0 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -121,27 +121,30 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
* Should expedited grace-period primitives always fall back to their
* non-expedited counterparts? Intended for use within RCU. Note
* that if the user specifies both rcu_expedited and rcu_normal, then
- * rcu_normal wins.
+ * rcu_normal wins. (Except during the boot-time window from when the
+ * first task is spawned until the rcu_exp_runtime_mode() core_initcall()
+ * is invoked, throughout which everything is expedited.)
*/
bool rcu_gp_is_normal(void)
{
- return READ_ONCE(rcu_normal);
+ return READ_ONCE(rcu_normal) &&
+ rcu_scheduler_active != RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
-static atomic_t rcu_expedited_nesting =
- ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
- * sysfs/boot variable into account as well as the rcu_expedite_gp()
- * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
- * returns false is a -really- bad idea.
+ * sysfs/boot variable and rcu_scheduler_active into account as well
+ * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp()
+ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
+ rcu_scheduler_active == RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
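
The comment above warns against spinning on rcu_gp_is_expedited(): the boot-time state of rcu_scheduler_active and the initial nesting count of one both force it true, so draining is not under the caller's control. The intended usage is a balanced pair around the section that needs fast grace periods, sketched here (illustrative only, using just the rcu_expedite_gp()/rcu_unexpedite_gp() API visible in this file):

	static void urgent_reconfigure(void)
	{
		rcu_expedite_gp();	/* Bump rcu_expedited_nesting. */
		synchronize_rcu();	/* Runs expedited while nesting > 0. */
		rcu_unexpedite_gp();	/* Balance the increment. */
	}

Note that with rcu_expedited_nesting now unconditionally ATOMIC_INIT(1), the single rcu_unexpedite_gp() in rcu_end_inkernel_boot() is what drops the boot-time expediting, with or without the removed CONFIG_RCU_EXPEDITE_BOOT option.
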
@@ -178,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
*/
void rcu_end_inkernel_boot(void)
{
- if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
- rcu_unexpedite_gp();
+ rcu_unexpedite_gp();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
}
@@ -257,7 +259,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -591,7 +593,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
void synchronize_rcu_tasks(void)
{
/* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
@@ -813,6 +815,23 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */
+/*
+ * Test each non-SRCU synchronous grace-period wait API. This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+ if (!IS_ENABLED(CONFIG_PROVE_RCU))
+ return;
+ synchronize_rcu();
+ synchronize_rcu_bh();
+ synchronize_sched();
+ synchronize_rcu_expedited();
+ synchronize_rcu_bh_expedited();
+ synchronize_sched_expedited();
+}
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -865,6 +884,7 @@ void rcu_early_boot_tests(void)
early_boot_test_call_rcu_bh();
if (rcu_self_test_sched)
early_boot_test_call_rcu_sched();
+ rcu_test_sync_prims();
}
static int rcu_verify_early_boot_tests(void)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b832ae2b..89ab6758667b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,8 +18,8 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o
+obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/autogroup.c
index da39489d2d80..da39489d2d80 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/autogroup.c
diff --git a/kernel/sched/auto_group.h b/kernel/sched/autogroup.h
index 890c95f2587a..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/autogroup.h
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e85a725e5c34..ad64efe41722 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -77,41 +77,88 @@ EXPORT_SYMBOL_GPL(sched_clock);
__read_mostly int sched_clock_running;
+void sched_clock_init(void)
+{
+ sched_clock_running = 1;
+}
+
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
-static int __sched_clock_stable_early;
+/*
+ * We must start with !__sched_clock_stable because the unstable -> stable
+ * transition is accurate, while the stable -> unstable transition is not.
+ *
+ * Similarly we start with __sched_clock_stable_early, thereby assuming we
+ * will become stable, such that there's only a single 1 -> 0 transition.
+ */
+static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
+static int __sched_clock_stable_early = 1;
-int sched_clock_stable(void)
+/*
+ * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
+ */
+static __read_mostly u64 raw_offset;
+static __read_mostly u64 gtod_offset;
+
+struct sched_clock_data {
+ u64 tick_raw;
+ u64 tick_gtod;
+ u64 clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
{
- return static_key_false(&__sched_clock_stable);
+ return this_cpu_ptr(&sched_clock_data);
}
-static void __set_sched_clock_stable(void)
+static inline struct sched_clock_data *cpu_sdc(int cpu)
{
- if (!sched_clock_stable())
- static_key_slow_inc(&__sched_clock_stable);
+ return &per_cpu(sched_clock_data, cpu);
+}
- tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
+int sched_clock_stable(void)
+{
+ return static_branch_likely(&__sched_clock_stable);
}
-void set_sched_clock_stable(void)
+static void __set_sched_clock_stable(void)
{
- __sched_clock_stable_early = 1;
+ struct sched_clock_data *scd = this_scd();
- smp_mb(); /* matches sched_clock_init() */
+ /*
+ * Attempt to make the (initial) unstable->stable transition continuous.
+ */
+ raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw);
- if (!sched_clock_running)
- return;
+ printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
+ scd->tick_gtod, gtod_offset,
+ scd->tick_raw, raw_offset);
- __set_sched_clock_stable();
+ static_branch_enable(&__sched_clock_stable);
+ tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
static void __clear_sched_clock_stable(struct work_struct *work)
{
- /* XXX worry about clock continuity */
- if (sched_clock_stable())
- static_key_slow_dec(&__sched_clock_stable);
+ struct sched_clock_data *scd = this_scd();
+
+ /*
+ * Attempt to make the stable->unstable transition continuous.
+ *
+ * Trouble is, this is typically called from the TSC watchdog
+ * timer, which is late by definition. This means the tick
+ * values can already be screwy.
+ *
+ * Still do what we can.
+ */
+ gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod);
+
+ printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+ scd->tick_gtod, gtod_offset,
+ scd->tick_raw, raw_offset);
+ static_branch_disable(&__sched_clock_stable);
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
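
A worked example (made-up numbers) of the continuity math in the two transitions above, using the invariant ktime_get_ns() + gtod_offset == sched_clock() + raw_offset:

	/* unstable -> stable, with tick_raw = 900, tick_gtod = 1000, gtod_offset = 0: */
	raw_offset  = (tick_gtod + gtod_offset) - tick_raw = (1000 + 0) - 900 = 100;
	/* The stable fast path, sched_clock() + raw_offset, resumes at ~1000, not 900. */

	/* stable -> unstable later, with tick_raw = 2000, tick_gtod = 1950: */
	gtod_offset = (tick_raw + raw_offset) - tick_gtod = (2000 + 100) - 1950 = 150;
	/* The GTOD-based path, tick_gtod + gtod_offset + delta, resumes at ~2100. */
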
@@ -121,47 +168,15 @@ void clear_sched_clock_stable(void)
{
__sched_clock_stable_early = 0;
- smp_mb(); /* matches sched_clock_init() */
-
- if (!sched_clock_running)
- return;
+ smp_mb(); /* matches sched_clock_init_late() */
- schedule_work(&sched_clock_work);
+ if (sched_clock_running == 2)
+ schedule_work(&sched_clock_work);
}
-struct sched_clock_data {
- u64 tick_raw;
- u64 tick_gtod;
- u64 clock;
-};
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-
-static inline struct sched_clock_data *this_scd(void)
+void sched_clock_init_late(void)
{
- return this_cpu_ptr(&sched_clock_data);
-}
-
-static inline struct sched_clock_data *cpu_sdc(int cpu)
-{
- return &per_cpu(sched_clock_data, cpu);
-}
-
-void sched_clock_init(void)
-{
- u64 ktime_now = ktime_to_ns(ktime_get());
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct sched_clock_data *scd = cpu_sdc(cpu);
-
- scd->tick_raw = 0;
- scd->tick_gtod = ktime_now;
- scd->clock = ktime_now;
- }
-
- sched_clock_running = 1;
-
+ sched_clock_running = 2;
/*
* Ensure that it is impossible to not do a static_key update.
*
@@ -173,8 +188,6 @@ void sched_clock_init(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
- else
- __clear_sched_clock_stable(NULL);
}
/*
@@ -216,7 +229,7 @@ again:
* scd->tick_gtod + TICK_NSEC);
*/
- clock = scd->tick_gtod + delta;
+ clock = scd->tick_gtod + gtod_offset + delta;
min_clock = wrap_max(scd->tick_gtod, old_clock);
max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
@@ -302,7 +315,7 @@ u64 sched_clock_cpu(int cpu)
u64 clock;
if (sched_clock_stable())
- return sched_clock();
+ return sched_clock() + raw_offset;
if (unlikely(!sched_clock_running))
return 0ull;
@@ -323,23 +336,22 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu);
void sched_clock_tick(void)
{
struct sched_clock_data *scd;
- u64 now, now_gtod;
-
- if (sched_clock_stable())
- return;
-
- if (unlikely(!sched_clock_running))
- return;
WARN_ON_ONCE(!irqs_disabled());
+ /*
+ * Update these values even if sched_clock_stable(), because it can
+ * become unstable at any point in time, at which point we need some
+ * values to fall back on.
+ *
+ * XXX arguably we can skip this if we expose tsc_clocksource_reliable
+ */
scd = this_scd();
- now_gtod = ktime_to_ns(ktime_get());
- now = sched_clock();
+ scd->tick_raw = sched_clock();
+ scd->tick_gtod = ktime_get_ns();
- scd->tick_raw = now;
- scd->tick_gtod = now_gtod;
- sched_clock_local(scd);
+ if (!sched_clock_stable() && likely(sched_clock_running))
+ sched_clock_local(scd);
}
/*
@@ -366,11 +378,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-void sched_clock_init(void)
-{
- sched_clock_running = 1;
-}
-
u64 sched_clock_cpu(int cpu)
{
if (unlikely(!sched_clock_running))
@@ -378,6 +385,7 @@ u64 sched_clock_cpu(int cpu)
return sched_clock();
}
+
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 8d0f35debf35..f063a25d4449 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -31,7 +31,8 @@ void complete(struct completion *x)
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
+ if (x->done != UINT_MAX)
+ x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
@@ -51,7 +52,7 @@ void complete_all(struct completion *x)
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
+ x->done = UINT_MAX;
__wake_up_locked(&x->wait, TASK_NORMAL, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
@@ -79,7 +80,8 @@ do_wait_for_common(struct completion *x,
if (!x->done)
return timeout;
}
- x->done--;
+ if (x->done != UINT_MAX)
+ x->done--;
return timeout ?: 1;
}
@@ -280,7 +282,7 @@ bool try_wait_for_completion(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
- else
+ else if (x->done != UINT_MAX)
x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
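
Taken together, the completion.c hunks make complete_all() sticky and overflow-proof: ->done saturates at UINT_MAX instead of being bumped by UINT_MAX/2 (which could wrap after repeated complete_all() calls), and the wait/try paths refuse to decrement the saturated value. A small sketch (not from the patch) of the behaviour this guarantees:

	static DECLARE_COMPLETION(setup_done);

	static void finish_setup(void)
	{
		complete_all(&setup_done);	/* ->done = UINT_MAX, "done" forever. */
	}

	static void any_waiter(void)
	{
		/* Returns immediately, no matter how many waiters came before: */
		wait_for_completion(&setup_done);
	}
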
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c56fb57f2991..34e2291a9a6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,88 +1,28 @@
/*
* kernel/sched/core.c
*
- * Kernel scheduler and related syscalls
+ * Core kernel scheduler code and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
- *
- * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
- * make semaphores SMP safe
- * 1998-11-19 Implemented schedule_timeout() and related stuff
- * by Andrea Arcangeli
- * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
- * hybrid priority-list and round-robin design with
- * an array-switch method of distributing timeslices
- * and per-CPU runqueues. Cleanups and useful suggestions
- * by Davide Libenzi, preemptible kernel bits by Robert Love.
- * 2003-09-03 Interactivity tuning by Con Kolivas.
- * 2004-04-02 Scheduler domains code by Nick Piggin
- * 2007-04-15 Work begun on replacing all interactivity tuning with a
- * fair scheduling design by Con Kolivas.
- * 2007-05-05 Load balancing (smp-nice) and other improvements
- * by Peter Williams
- * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
- * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
- * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
- * Thomas Gleixner, Mike Kravetz
*/
-
-#include <linux/kasan.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/init.h>
-#include <linux/uaccess.h>
-#include <linux/highmem.h>
-#include <linux/mmu_context.h>
-#include <linux/interrupt.h>
-#include <linux/capability.h>
-#include <linux/completion.h>
-#include <linux/kernel_stat.h>
-#include <linux/debug_locks.h>
-#include <linux/perf_event.h>
-#include <linux/security.h>
-#include <linux/notifier.h>
-#include <linux/profile.h>
-#include <linux/freezer.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/delay.h>
-#include <linux/pid_namespace.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/timer.h>
-#include <linux/rcupdate.h>
-#include <linux/cpu.h>
+#include <linux/sched.h>
#include <linux/cpuset.h>
-#include <linux/percpu.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/sysctl.h>
-#include <linux/syscalls.h>
-#include <linux/times.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <linux/unistd.h>
-#include <linux/pagemap.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
-#include <linux/ctype.h>
-#include <linux/ftrace.h>
-#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
-#include <linux/compiler.h>
-#include <linux/frame.h>
+
+#include <linux/blkdev.h>
+#include <linux/kprobes.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
#include <linux/prefetch.h>
-#include <linux/mutex.h>
+#include <linux/profile.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include <asm/irq_regs.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
#include "sched.h"
#include "../workqueue_internal.h"
@@ -91,27 +31,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static void update_rq_clock_task(struct rq *rq, s64 delta);
-
-void update_rq_clock(struct rq *rq)
-{
- s64 delta;
-
- lockdep_assert_held(&rq->lock);
-
- if (rq->clock_skip_update & RQCF_ACT_SKIP)
- return;
-
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
- if (delta < 0)
- return;
- rq->clock += delta;
- update_rq_clock_task(rq, delta);
-}
-
/*
* Debugging: various feature bits
*/
@@ -140,7 +61,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
/*
- * period over which we measure -rt task cpu usage in us.
+ * period over which we measure -rt task CPU usage in us.
* default: 1s
*/
unsigned int sysctl_sched_rt_period = 1000000;
@@ -153,7 +74,7 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/* cpus with isolated domains */
+/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;
/*
@@ -185,7 +106,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
return rq;
}
raw_spin_unlock(&rq->lock);
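
This hunk starts the tree-wide conversion from a bare struct pin_cookie to struct rq_flags. Judging by the diffstat, the rq_pin_lock()/rq_unpin_lock()/rq_repin_lock() helpers live in kernel/sched/sched.h (not shown here); presumably they wrap lockdep_pin_lock() and friends and stash the cookie, plus the new clock_update_flags state, in the rq_flags the caller already carries. The usage pattern, as seen throughout this patch:

	struct rq_flags rf;

	raw_spin_lock(&rq->lock);
	rq_pin_lock(rq, &rf);	/* Nobody may unlock rq->lock behind our back. */
	/* ... rq is stable here; pass &rf to anything that must drop the lock ... */
	rq_unpin_lock(rq, &rf);
	raw_spin_unlock(&rq->lock);
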
@@ -221,11 +142,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* If we observe the old cpu in task_rq_lock, the acquire of
* the old rq->lock will fully serialize against the stores.
*
- * If we observe the new cpu in task_rq_lock, the acquire will
+ * If we observe the new CPU in task_rq_lock, the acquire will
* pair with the WMB to ensure we must then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
return rq;
}
raw_spin_unlock(&rq->lock);
@@ -236,6 +157,84 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
}
}
+/*
+ * RQ-clock updating methods:
+ */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight misattribution of {soft,}irq
+ * time; a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq += steal;
+ delta -= steal;
+ }
+#endif
+
+ rq->clock_task += delta;
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
+}
+
+void update_rq_clock(struct rq *rq)
+{
+ s64 delta;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_update_flags & RQCF_ACT_SKIP)
+ return;
+
+#ifdef CONFIG_SCHED_DEBUG
+ rq->clock_update_flags |= RQCF_UPDATED;
+#endif
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ if (delta < 0)
+ return;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
+}
+
+
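A quick numeric example of the clamping in update_rq_clock_task() above. Suppose the raw rq clock advanced by delta = 3 ms, but irq_time grew by 4 ms since the last update (possible, since irq_time is only folded in on {soft,}irq exit):

	irq_delta = min(4, 3) = 3;	/* Clip to what this window can absorb. */
	rq->prev_irq_time += 3;		/* The remaining 1 ms is consumed by the next update. */
	delta -= 3;			/* delta = 0: ->clock_task stalls, but never goes backwards. */
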
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
@@ -458,7 +457,7 @@ void wake_up_q(struct wake_q_head *head)
task = container_of(node, struct task_struct, wake_q);
BUG_ON(!task);
- /* task can safely be re-inserted now */
+ /* Task can safely be re-inserted now: */
node = node->next;
task->wake_q.next = NULL;
@@ -516,12 +515,12 @@ void resched_cpu(int cpu)
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu. This is good for power-savings.
+ * In the semi idle case, use the nearest busy CPU for migrating timers
+ * from an idle CPU. This is good for power-savings.
*
* We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ * selecting an idle CPU will add more delays to the timers than intended
+ * (as that CPU's timer base may not be uptodate wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
@@ -550,6 +549,7 @@ unlock:
rcu_read_unlock();
return cpu;
}
+
/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
@@ -784,60 +784,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- s64 steal = 0, irq_delta = 0;
-#endif
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}irq region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}irq
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
-
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
- steal -= rq->prev_steal_time_rq;
-
- if (unlikely(steal > delta))
- steal = delta;
-
- rq->prev_steal_time_rq += steal;
- delta -= steal;
- }
-#endif
-
- rq->clock_task += delta;
-
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
- sched_rt_avg_update(rq, irq_delta + steal);
-#endif
-}
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1018,7 +964,7 @@ struct migration_arg {
};
/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * Move (not current) task off this CPU, onto the destination CPU. We're doing
* this because either it can't run here any more (set_cpus_allowed()
* away from this CPU, or CPU going down), or because we're
* attempting to rebalance this task on exec (sched_exec).
@@ -1052,8 +998,8 @@ static int migration_cpu_stop(void *data)
struct rq *rq = this_rq();
/*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
+ * The original target CPU might have gone down and we might
+ * be on another CPU but it doesn't matter.
*/
local_irq_disable();
/*
@@ -1171,7 +1117,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (p->flags & PF_KTHREAD) {
/*
* For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-cpu threads.
+ * !active we want to ensure they are strict per-CPU threads.
*/
WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
!cpumask_intersects(new_mask, cpu_active_mask) &&
@@ -1195,9 +1141,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq_unpin_lock(rq, &rf);
rq = move_queued_task(rq, p, dest_cpu);
- lockdep_repin_lock(&rq->lock, rf.cookie);
+ rq_repin_lock(rq, &rf);
}
out:
task_rq_unlock(rq, p, &rf);
@@ -1276,7 +1222,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
/*
* Task isn't running anymore; make it appear like we migrated
* it before it went to sleep. This means on wakeup we make the
- * previous cpu our target instead of where it really is.
+ * previous CPU our target instead of where it really is.
*/
p->wake_cpu = cpu;
}
@@ -1508,12 +1454,12 @@ EXPORT_SYMBOL_GPL(kick_process);
*
* - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
* see __set_cpus_allowed_ptr(). At this point the newly online
- * cpu isn't yet part of the sched domains, and balancing will not
+ * CPU isn't yet part of the sched domains, and balancing will not
* see it.
*
- * - on cpu-down we clear cpu_active() to mask the sched domains and
+ * - on CPU-down we clear cpu_active() to mask the sched domains and
* avoid the load balancer to place new tasks on the to be removed
- * cpu. Existing tasks will remain running there and will be taken
+ * CPU. Existing tasks will remain running there and will be taken
* off.
*
* This means that fallback selection must not select !active CPUs.
@@ -1529,9 +1475,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
int dest_cpu;
/*
- * If the node that the cpu is on has been offlined, cpu_to_node()
- * will return -1. There is no cpu on the node, and we should
- * select the cpu on the other node.
+ * If the node that the CPU is on has been offlined, cpu_to_node()
+ * will return -1. There is no CPU on the node, and we should
+ * select the CPU on the other node.
*/
if (nid != -1) {
nodemask = cpumask_of_node(nid);
@@ -1563,7 +1509,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
state = possible;
break;
}
- /* fall-through */
+ /* Fall-through */
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
@@ -1607,7 +1553,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
/*
* In order not to call set_task_cpu() on a blocking task we need
* to rely on ttwu() to place the task on a valid ->cpus_allowed
- * cpu.
+ * CPU.
*
* Since this is common to all placement strategies, this lives here.
*
@@ -1681,7 +1627,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
activate_task(rq, p, en_flags);
p->on_rq = TASK_ON_RQ_QUEUED;
- /* if a worker is waking up, notify workqueue */
+ /* If a worker is waking up, notify the workqueue: */
if (p->flags & PF_WQ_WORKER)
wq_worker_waking_up(p, cpu_of(rq));
}
@@ -1690,7 +1636,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
* Mark the task runnable and perform wakeup-preemption.
*/
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct pin_cookie cookie)
+ struct rq_flags *rf)
{
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -1702,9 +1648,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
* Our task @p is fully woken up and running; so its safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
p->sched_class->task_woken(rq, p);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
}
if (rq->idle_stamp) {
@@ -1723,7 +1669,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct pin_cookie cookie)
+ struct rq_flags *rf)
{
int en_flags = ENQUEUE_WAKEUP;
@@ -1738,7 +1684,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
ttwu_activate(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, cookie);
+ ttwu_do_wakeup(rq, p, wake_flags, rf);
}
/*
@@ -1757,7 +1703,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
+ ttwu_do_wakeup(rq, p, wake_flags, &rf);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -1770,15 +1716,15 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct pin_cookie cookie;
struct task_struct *p;
unsigned long flags;
+ struct rq_flags rf;
if (!llist)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
- cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, &rf);
while (llist) {
int wake_flags = 0;
@@ -1789,10 +1735,10 @@ void sched_ttwu_pending(void)
if (p->sched_remote_wakeup)
wake_flags = WF_MIGRATED;
- ttwu_do_activate(rq, p, wake_flags, cookie);
+ ttwu_do_activate(rq, p, wake_flags, &rf);
}
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1864,7 +1810,7 @@ void wake_up_if_idle(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
if (is_idle_task(rq->curr))
smp_send_reschedule(cpu);
- /* Else cpu is not in idle, do nothing here */
+ /* Else CPU is not idle, do nothing here: */
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1881,20 +1827,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
- struct pin_cookie cookie;
+ struct rq_flags rf;
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
ttwu_queue_remote(p, cpu, wake_flags);
return;
}
#endif
raw_spin_lock(&rq->lock);
- cookie = lockdep_pin_lock(&rq->lock);
- ttwu_do_activate(rq, p, wake_flags, cookie);
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_pin_lock(rq, &rf);
+ ttwu_do_activate(rq, p, wake_flags, &rf);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock(&rq->lock);
}
@@ -1904,8 +1850,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* MIGRATION
*
* The basic program-order guarantee on SMP systems is that when a task [t]
- * migrates, all its activity on its old cpu [c0] happens-before any subsequent
- * execution on its new cpu [c1].
+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent
+ * execution on its new CPU [c1].
*
* For migration (of runnable tasks) this is provided by the following means:
*
@@ -1916,7 +1862,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
*
* Transitivity guarantees that B happens after A and C after B.
* Note: we only require RCpc transitivity.
- * Note: the cpu doing B need not be c0 or c1
+ * Note: the CPU doing B need not be c0 or c1
*
* Example:
*
@@ -2024,7 +1970,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
trace_sched_waking(p);
- success = 1; /* we're going to change ->state */
+ /* We're going to change ->state: */
+ success = 1;
cpu = task_cpu(p);
/*
@@ -2073,7 +2020,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
smp_rmb();
/*
- * If the owning (remote) cpu is still in the middle of schedule() with
+ * If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_lock_switch().
@@ -2086,11 +2033,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
+#else /* CONFIG_SMP */
+
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2111,7 +2071,7 @@ out:
* ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task.
*/
-static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
+static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
{
struct rq *rq = task_rq(p);
@@ -2128,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
* disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
}
if (!(p->state & TASK_NORMAL))
@@ -2140,10 +2100,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
trace_sched_waking(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&rq->nr_iowait);
+ }
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }
- ttwu_do_wakeup(rq, p, 0, cookie);
+ ttwu_do_wakeup(rq, p, 0, rf);
ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
@@ -2427,7 +2392,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
/*
- * We're setting the cpu for the first time, we don't migrate,
+ * We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
*/
__set_task_cpu(p, cpu);
@@ -2570,7 +2535,7 @@ void wake_up_new_task(struct task_struct *p)
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
- * - any previously selected cpu might disappear through hotplug
+ * - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
@@ -2578,6 +2543,7 @@ void wake_up_new_task(struct task_struct *p)
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
activate_task(rq, p, 0);
@@ -2590,9 +2556,9 @@ void wake_up_new_task(struct task_struct *p)
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
- lockdep_repin_lock(&rq->lock, rf.cookie);
+ rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
@@ -2861,7 +2827,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next, struct pin_cookie cookie)
+ struct task_struct *next, struct rq_flags *rf)
{
struct mm_struct *mm, *oldmm;
@@ -2887,13 +2853,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
+
+ rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
/*
* Since the runqueue lock will be released by the next
* task (which is an invalid locking op but in the case
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
@@ -2920,7 +2889,7 @@ unsigned long nr_running(void)
}
/*
- * Check if only the current task is running on the cpu.
+ * Check if only the current task is running on the CPU.
*
* Caution: this function does not check that the caller has disabled
* preemption, thus the result might have a time-of-check-to-time-of-use
@@ -2949,6 +2918,36 @@ unsigned long long nr_context_switches(void)
return sum;
}
+/*
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait accounting is to account the idle time that we could
+ * have spent running if it were not for IO. That is, if we were to improve the
+ * storage performance, we'd have a proportional reduction in IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU: only that
+ * CPU will have IO-wait accounted, while the other has regular idle, even
+ * though, if the storage were faster, both could've run at the same time,
+ * utilising both CPUs.
+ *
+ * This means that, when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under-accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU; it can wake on a different CPU from
+ * the one it blocked on. This means the per-CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
unsigned long nr_iowait(void)
{
unsigned long i, sum = 0;
@@ -2959,6 +2958,13 @@ unsigned long nr_iowait(void)
return sum;
}
+/*
+ * Consumers of these two interfaces, like for example the cpufreq menu
+ * governor, are using nonsensical data: they boost the frequency of a CPU
+ * with pending IO-wait even though that CPU might not end up running the
+ * task when it does become runnable.
+ */
+
unsigned long nr_iowait_cpu(int cpu)
{
struct rq *this = cpu_rq(cpu);
@@ -3042,8 +3048,8 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* So we have a optimization chance when the task's delta_exec is 0.
* Reading ->on_cpu is racy, but this is ok.
*
- * If we race with it leaving cpu, we'll take a lock. So we're correct.
- * If we race with it entering cpu, unaccounted time is 0. This is
+ * If we race with it leaving CPU, we'll take a lock. So we're correct.
+ * If we race with it entering CPU, unaccounted time is 0. This is
* indistinguishable from the read occurring a few cycles earlier.
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
@@ -3257,31 +3263,30 @@ static inline void schedule_debug(struct task_struct *prev)
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- const struct sched_class *class = &fair_sched_class;
+ const struct sched_class *class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(prev->sched_class == class &&
- rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev, cookie);
+ if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+ p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto again;
- /* assumes fair_sched_class->next == idle_sched_class */
+ /* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev, cookie);
+ p = idle_sched_class.pick_next_task(rq, prev, rf);
return p;
}
again:
for_each_class(class) {
- p = class->pick_next_task(rq, prev, cookie);
+ p = class->pick_next_task(rq, prev, rf);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
@@ -3289,7 +3294,8 @@ again:
}
}
- BUG(); /* the idle class will always have a runnable task */
+ /* The idle class should always have a runnable task: */
+ BUG();
}
/*
@@ -3335,7 +3341,7 @@ static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
- struct pin_cookie cookie;
+ struct rq_flags rf;
struct rq *rq;
int cpu;
@@ -3358,9 +3364,10 @@ static void __sched notrace __schedule(bool preempt)
*/
smp_mb__before_spinlock();
raw_spin_lock(&rq->lock);
- cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, &rf);
- rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+ /* Promote REQ to ACT */
+ rq->clock_update_flags <<= 1;
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
@@ -3370,6 +3377,11 @@ static void __sched notrace __schedule(bool preempt)
deactivate_task(rq, prev, DEQUEUE_SLEEP);
prev->on_rq = 0;
+ if (prev->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+
/*
* If a worker went to sleep, notify and ask workqueue
* whether it wants to wake up a task to maintain
@@ -3380,7 +3392,7 @@ static void __sched notrace __schedule(bool preempt)
to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup)
- try_to_wake_up_local(to_wakeup, cookie);
+ try_to_wake_up_local(to_wakeup, &rf);
}
}
switch_count = &prev->nvcsw;
@@ -3389,10 +3401,9 @@ static void __sched notrace __schedule(bool preempt)
if (task_on_rq_queued(prev))
update_rq_clock(rq);
- next = pick_next_task(rq, prev, cookie);
+ next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- rq->clock_skip_update = 0;
if (likely(prev != next)) {
rq->nr_switches++;
@@ -3400,9 +3411,12 @@ static void __sched notrace __schedule(bool preempt)
++*switch_count;
trace_sched_switch(preempt, prev, next);
- rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
+
+ /* Also unlocks the rq: */
+ rq = context_switch(rq, prev, next, &rf);
} else {
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock_irq(&rq->lock);
}
@@ -3426,14 +3440,18 @@ void __noreturn do_task_dead(void)
smp_mb();
raw_spin_unlock_wait(&current->pi_lock);
- /* causes final put_task_struct in finish_task_switch(). */
+ /* Causes final put_task_struct in finish_task_switch(): */
__set_current_state(TASK_DEAD);
- current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
+
+ /* Tell freezer to ignore us: */
+ current->flags |= PF_NOFREEZE;
+
__schedule(false);
BUG();
- /* Avoid "noreturn function does return". */
+
+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
for (;;)
- cpu_relax(); /* For when BUG is null */
+ cpu_relax();
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -3651,6 +3669,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
/*
* Idle task boosting is a nono in general. There is one
@@ -3725,7 +3744,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
- preempt_disable(); /* avoid rq from going away on us */
+ /* Prevent the rq from going away on us: */
+ preempt_disable();
__task_rq_unlock(rq, &rf);
balance_callback(rq);
@@ -3747,6 +3767,8 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -3793,7 +3815,7 @@ EXPORT_SYMBOL(set_user_nice);
*/
int can_nice(const struct task_struct *p, const int nice)
{
- /* convert nice value [19,-20] to rlimit style value [1,40] */
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
@@ -3849,7 +3871,7 @@ int task_prio(const struct task_struct *p)
}
/**
- * idle_cpu - is a given cpu idle currently?
+ * idle_cpu - is a given CPU idle currently?
* @cpu: the processor in question.
*
* Return: 1 if the CPU is currently idle. 0 otherwise.
@@ -3873,10 +3895,10 @@ int idle_cpu(int cpu)
}
/**
- * idle_task - return the idle task for a given cpu.
+ * idle_task - return the idle task for a given CPU.
* @cpu: the processor in question.
*
- * Return: The idle task for the cpu @cpu.
+ * Return: The idle task for the CPU @cpu.
*/
struct task_struct *idle_task(int cpu)
{
@@ -4042,7 +4064,7 @@ __checkparam_dl(const struct sched_attr *attr)
}
/*
- * check the target process has a UID that matches the current process's
+ * Check the target process has a UID that matches the current process's:
*/
static bool check_same_owner(struct task_struct *p)
{
@@ -4057,8 +4079,7 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
-static bool dl_param_changed(struct task_struct *p,
- const struct sched_attr *attr)
+static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -4085,10 +4106,10 @@ static int __sched_setscheduler(struct task_struct *p,
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
- /* may grab non-irq protected spin_locks */
+ /* May grab non-irq protected spin_locks: */
BUG_ON(in_interrupt());
recheck:
- /* double check policy once rq lock held */
+ /* Double check policy once rq lock held: */
if (policy < 0) {
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
@@ -4128,11 +4149,11 @@ recheck:
unsigned long rlim_rtprio =
task_rlimit(p, RLIMIT_RTPRIO);
- /* can't set/change the rt policy */
+ /* Can't set/change the rt policy: */
if (policy != p->policy && !rlim_rtprio)
return -EPERM;
- /* can't increase priority */
+ /* Can't increase priority: */
if (attr->sched_priority > p->rt_priority &&
attr->sched_priority > rlim_rtprio)
return -EPERM;
@@ -4156,11 +4177,11 @@ recheck:
return -EPERM;
}
- /* can't change other user's priorities */
+ /* Can't change other user's priorities: */
if (!check_same_owner(p))
return -EPERM;
- /* Normal users shall not reset the sched_reset_on_fork flag */
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
if (p->sched_reset_on_fork && !reset_on_fork)
return -EPERM;
}
@@ -4172,16 +4193,17 @@ recheck:
}
/*
- * make sure no PI-waiters arrive (or leave) while we are
+ * Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
*
* To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
/*
- * Changing the policy of the stop threads its a very bad idea
+ * Changing the policy of the stop threads is a very bad idea:
*/
if (p == rq->stop) {
task_rq_unlock(rq, p, &rf);
@@ -4237,7 +4259,7 @@ change:
#endif
}
- /* recheck policy now with rq lock held */
+ /* Re-check policy now with rq lock held: */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
@@ -4294,15 +4316,15 @@ change:
set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
- preempt_disable(); /* avoid rq from going away on us */
+
+ /* Prevent the rq from going away on us: */
+ preempt_disable();
task_rq_unlock(rq, p, &rf);
if (pi)
rt_mutex_adjust_pi(p);
- /*
- * Run balance callbacks after we've adjusted the PI chain.
- */
+ /* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
@@ -4395,8 +4417,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
/*
* Mimics kernel/events/core.c perf_copy_attr().
*/
-static int sched_copy_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr)
+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
u32 size;
int ret;
@@ -4404,19 +4425,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
return -EFAULT;
- /*
- * zero the full structure, so that a short copy will be nice.
- */
+ /* Zero the full structure, so that a short copy will be nice: */
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
- if (size > PAGE_SIZE) /* silly large */
+ /* Bail out on silly large sizes: */
+ if (size > PAGE_SIZE)
goto err_size;
- if (!size) /* abi compat */
+ /* ABI compatibility quirk: */
+ if (!size)
size = SCHED_ATTR_SIZE_VER0;
if (size < SCHED_ATTR_SIZE_VER0)
@@ -4451,7 +4472,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
return -EFAULT;
/*
- * XXX: do we want to be lenient like existing syscalls; or do we want
+ * XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
*/
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
@@ -4471,10 +4492,8 @@ err_size:
*
* Return: 0 on success. An error code otherwise.
*/
-SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
- struct sched_param __user *, param)
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
- /* negative values for policy are not valid */
if (policy < 0)
return -EINVAL;
@@ -4784,10 +4803,10 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
}
/**
- * sys_sched_setaffinity - set the cpu affinity of a process
+ * sys_sched_setaffinity - set the CPU affinity of a process
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new cpu mask
+ * @user_mask_ptr: user-space pointer to the new CPU mask
*
* Return: 0 on success. An error code otherwise.
*/
@@ -4835,10 +4854,10 @@ out_unlock:
}
/**
- * sys_sched_getaffinity - get the cpu affinity of a process
+ * sys_sched_getaffinity - get the CPU affinity of a process
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
*
* Return: size of CPU mask copied to user_mask_ptr on success. An
* error code otherwise.
@@ -4966,7 +4985,7 @@ EXPORT_SYMBOL(__cond_resched_softirq);
* Typical broken usage is:
*
* while (!event)
- * yield();
+ * yield();
*
* where one assumes that yield() will let 'the other' process run that will
* make event true. If the current task is a SCHED_FIFO task that will never
@@ -5057,31 +5076,48 @@ out_irq:
}
EXPORT_SYMBOL_GPL(yield_to);
+int io_schedule_prepare(void)
+{
+ int old_iowait = current->in_iowait;
+
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
+ return old_iowait;
+}
+
+void io_schedule_finish(int token)
+{
+ current->in_iowait = token;
+}
+
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
long __sched io_schedule_timeout(long timeout)
{
- int old_iowait = current->in_iowait;
- struct rq *rq;
+ int token;
long ret;
- current->in_iowait = 1;
- blk_schedule_flush_plug(current);
-
- delayacct_blkio_start();
- rq = raw_rq();
- atomic_inc(&rq->nr_iowait);
+ token = io_schedule_prepare();
ret = schedule_timeout(timeout);
- current->in_iowait = old_iowait;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
+ io_schedule_finish(token);
return ret;
}
EXPORT_SYMBOL(io_schedule_timeout);
+void io_schedule(void)
+{
+ int token;
+
+ token = io_schedule_prepare();
+ schedule();
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(io_schedule);
+
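Splitting out io_schedule_prepare()/io_schedule_finish() lets callers wrap sleeping primitives other than a bare schedule() in IO-wait accounting. A hypothetical caller (not part of this patch; mutex_lock() is used purely for illustration):

	static void lock_for_io(struct mutex *lock)
	{
		int token;

		token = io_schedule_prepare();	/* Set in_iowait, flush the block plug. */
		mutex_lock(lock);		/* Any sleep here counts as IO-wait. */
		io_schedule_finish(token);	/* Restore the previous in_iowait state. */
	}

Returning the old in_iowait value as a token means such sections nest correctly.
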
/**
* sys_sched_get_priority_max - return maximum RT priority.
* @policy: scheduling class.
@@ -5264,7 +5300,7 @@ void init_idle_bootup_task(struct task_struct *idle)
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
- * @cpu: cpu the idle task belongs to
+ * @cpu: CPU the idle task belongs to
*
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
@@ -5295,7 +5331,7 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
/*
* We're having a chicken and egg problem, even though we are
- * holding rq->lock, the cpu isn't yet set to this cpu so the
+ * holding rq->lock, the CPU isn't yet set to this CPU so the
* lockdep check in task_group() will fail.
*
* Similar case to sched_fork(). / Alternatively we could
@@ -5360,7 +5396,7 @@ int task_can_attach(struct task_struct *p,
/*
* Kthreads which disallow setaffinity shouldn't be moved
- * to a new cpuset; we don't want to change their cpu
+ * to a new cpuset; we don't want to change their CPU
* affinity and isolating such threads by their set of
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
@@ -5409,7 +5445,7 @@ out:
#ifdef CONFIG_SMP
-static bool sched_smp_initialized __read_mostly;
+bool sched_smp_initialized __read_mostly;
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -5461,7 +5497,7 @@ void sched_setnuma(struct task_struct *p, int nid)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Ensures that the idle task is using init_mm right before its cpu goes
+ * Ensure that the idle task is using init_mm right before its CPU goes
* offline.
*/
void idle_task_exit(void)
@@ -5521,7 +5557,7 @@ static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct pin_cookie cookie;
+ struct rq_flags rf, old_rf;
int dest_cpu;
/*
@@ -5545,16 +5581,16 @@ static void migrate_tasks(struct rq *dead_rq)
for (;;) {
/*
* There's this thread running, bail when that's the only
- * remaining thread.
+ * remaining thread:
*/
if (rq->nr_running == 1)
break;
/*
- * pick_next_task assumes pinned rq->lock.
+ * pick_next_task() assumes pinned rq->lock:
*/
- cookie = lockdep_pin_lock(&rq->lock);
- next = pick_next_task(rq, &fake_task, cookie);
+ rq_pin_lock(rq, &rf);
+ next = pick_next_task(rq, &fake_task, &rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5567,7 +5603,7 @@ static void migrate_tasks(struct rq *dead_rq)
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock);
@@ -5582,6 +5618,13 @@ static void migrate_tasks(struct rq *dead_rq)
continue;
}
+ /*
+ * __migrate_task() may return with a different
+ * rq->lock held and a new cookie in 'rf', but we need
+ * to preserve rf::clock_update_flags for 'dead_rq'.
+ */
+ old_rf = rf;
+
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
@@ -5590,6 +5633,7 @@ static void migrate_tasks(struct rq *dead_rq)
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
+ rf = old_rf;
}
raw_spin_unlock(&next->pi_lock);
}
@@ -5598,7 +5642,7 @@ static void migrate_tasks(struct rq *dead_rq)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static void set_rq_online(struct rq *rq)
+void set_rq_online(struct rq *rq)
{
if (!rq->online) {
const struct sched_class *class;
@@ -5613,7 +5657,7 @@ static void set_rq_online(struct rq *rq)
}
}
-static void set_rq_offline(struct rq *rq)
+void set_rq_offline(struct rq *rq)
{
if (rq->online) {
const struct sched_class *class;
@@ -5635,1647 +5679,10 @@ static void set_cpu_rq_start_time(unsigned int cpu)
rq->age_stamp = sched_clock_cpu(cpu);
}
-static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
-
-#ifdef CONFIG_SCHED_DEBUG
-
-static __read_mostly int sched_debug_enabled;
-
-static int __init sched_debug_setup(char *str)
-{
- sched_debug_enabled = 1;
-
- return 0;
-}
-early_param("sched_debug", sched_debug_setup);
-
-static inline bool sched_debug(void)
-{
- return sched_debug_enabled;
-}
-
-static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
- struct cpumask *groupmask)
-{
- struct sched_group *group = sd->groups;
-
- cpumask_clear(groupmask);
-
- printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
-
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
- return -1;
- }
-
- printk(KERN_CONT "span %*pbl level %s\n",
- cpumask_pr_args(sched_domain_span(sd)), sd->name);
-
- if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- printk(KERN_ERR "ERROR: domain->span does not contain "
- "CPU%d\n", cpu);
- }
- if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
- printk(KERN_ERR "ERROR: domain->groups does not contain"
- " CPU%d\n", cpu);
- }
-
- printk(KERN_DEBUG "%*s groups:", level + 1, "");
- do {
- if (!group) {
- printk("\n");
- printk(KERN_ERR "ERROR: group is NULL\n");
- break;
- }
-
- if (!cpumask_weight(sched_group_cpus(group))) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: empty group\n");
- break;
- }
-
- if (!(sd->flags & SD_OVERLAP) &&
- cpumask_intersects(groupmask, sched_group_cpus(group))) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: repeated CPUs\n");
- break;
- }
-
- cpumask_or(groupmask, groupmask, sched_group_cpus(group));
-
- printk(KERN_CONT " %*pbl",
- cpumask_pr_args(sched_group_cpus(group)));
- if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %lu)",
- group->sgc->capacity);
- }
-
- group = group->next;
- } while (group != sd->groups);
- printk(KERN_CONT "\n");
-
- if (!cpumask_equal(sched_domain_span(sd), groupmask))
- printk(KERN_ERR "ERROR: groups don't span domain->span\n");
-
- if (sd->parent &&
- !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
- printk(KERN_ERR "ERROR: parent span is not a superset "
- "of domain->span\n");
- return 0;
-}
-
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
-{
- int level = 0;
-
- if (!sched_debug_enabled)
- return;
-
- if (!sd) {
- printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
- return;
- }
-
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
-
- for (;;) {
- if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
- break;
- level++;
- sd = sd->parent;
- if (!sd)
- break;
- }
-}
-#else /* !CONFIG_SCHED_DEBUG */
-
-# define sched_debug_enabled 0
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
- return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
-
-static int sd_degenerate(struct sched_domain *sd)
-{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
-
- /* Following flags need at least 2 groups */
- if (sd->flags & (SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
- if (sd->groups != sd->groups->next)
- return 0;
- }
-
- /* Following flags don't use groups */
- if (sd->flags & (SD_WAKE_AFFINE))
- return 0;
-
- return 1;
-}
-
-static int
-sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
-{
- unsigned long cflags = sd->flags, pflags = parent->flags;
-
- if (sd_degenerate(parent))
- return 1;
-
- if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
- return 0;
-
- /* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
- if (~cflags & pflags)
- return 0;
-
- return 1;
-}
-
-static void free_rootdomain(struct rcu_head *rcu)
-{
- struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
-
- cpupri_cleanup(&rd->cpupri);
- cpudl_cleanup(&rd->cpudl);
- free_cpumask_var(rd->dlo_mask);
- free_cpumask_var(rd->rto_mask);
- free_cpumask_var(rd->online);
- free_cpumask_var(rd->span);
- kfree(rd);
-}
-
-static void rq_attach_root(struct rq *rq, struct root_domain *rd)
-{
- struct root_domain *old_rd = NULL;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- if (rq->rd) {
- old_rd = rq->rd;
-
- if (cpumask_test_cpu(rq->cpu, old_rd->online))
- set_rq_offline(rq);
-
- cpumask_clear_cpu(rq->cpu, old_rd->span);
-
- /*
- * If we dont want to free the old_rd yet then
- * set old_rd to NULL to skip the freeing later
- * in this function:
- */
- if (!atomic_dec_and_test(&old_rd->refcount))
- old_rd = NULL;
- }
-
- atomic_inc(&rd->refcount);
- rq->rd = rd;
-
- cpumask_set_cpu(rq->cpu, rd->span);
- if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
- set_rq_online(rq);
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-
- if (old_rd)
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
-}
-
-static int init_rootdomain(struct root_domain *rd)
-{
- memset(rd, 0, sizeof(*rd));
-
- if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
- goto out;
- if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
- goto free_span;
- if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
- goto free_online;
- if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
- goto free_dlo_mask;
-
- init_dl_bw(&rd->dl_bw);
- if (cpudl_init(&rd->cpudl) != 0)
- goto free_dlo_mask;
-
- if (cpupri_init(&rd->cpupri) != 0)
- goto free_rto_mask;
- return 0;
-
-free_rto_mask:
- free_cpumask_var(rd->rto_mask);
-free_dlo_mask:
- free_cpumask_var(rd->dlo_mask);
-free_online:
- free_cpumask_var(rd->online);
-free_span:
- free_cpumask_var(rd->span);
-out:
- return -ENOMEM;
-}
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-struct root_domain def_root_domain;
-
-static void init_defrootdomain(void)
-{
- init_rootdomain(&def_root_domain);
-
- atomic_set(&def_root_domain.refcount, 1);
-}
-
-static struct root_domain *alloc_rootdomain(void)
-{
- struct root_domain *rd;
-
- rd = kmalloc(sizeof(*rd), GFP_KERNEL);
- if (!rd)
- return NULL;
-
- if (init_rootdomain(rd) != 0) {
- kfree(rd);
- return NULL;
- }
-
- return rd;
-}
-
-static void free_sched_groups(struct sched_group *sg, int free_sgc)
-{
- struct sched_group *tmp, *first;
-
- if (!sg)
- return;
-
- first = sg;
- do {
- tmp = sg->next;
-
- if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
- kfree(sg->sgc);
-
- kfree(sg);
- sg = tmp;
- } while (sg != first);
-}
-
-static void destroy_sched_domain(struct sched_domain *sd)
-{
- /*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
- */
- if (sd->flags & SD_OVERLAP) {
- free_sched_groups(sd->groups, 1);
- } else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgc);
- kfree(sd->groups);
- }
- if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
- kfree(sd->shared);
- kfree(sd);
-}
-
-static void destroy_sched_domains_rcu(struct rcu_head *rcu)
-{
- struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
- while (sd) {
- struct sched_domain *parent = sd->parent;
- destroy_sched_domain(sd);
- sd = parent;
- }
-}
-
-static void destroy_sched_domains(struct sched_domain *sd)
-{
- if (sd)
- call_rcu(&sd->rcu, destroy_sched_domains_rcu);
-}
-
-/*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
- * allows us to avoid some pointer chasing in select_idle_sibling().
- *
- * Also keep a unique ID per domain (we use the first cpu number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two cpus are in the same cache domain, see cpus_share_cache().
- */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
-DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym);
-
-static void update_top_cache_domain(int cpu)
-{
- struct sched_domain_shared *sds = NULL;
- struct sched_domain *sd;
- int id = cpu;
- int size = 1;
-
- sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd) {
- id = cpumask_first(sched_domain_span(sd));
- size = cpumask_weight(sched_domain_span(sd));
- sds = sd->shared;
- }
-
- rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
- per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
- rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
-
- sd = lowest_flag_domain(cpu, SD_NUMA);
- rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
-
- sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
- rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
-}
-
-/*
- * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
- * hold the hotplug lock.
- */
-static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- struct sched_domain *tmp;
-
- /* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; ) {
- struct sched_domain *parent = tmp->parent;
- if (!parent)
- break;
-
- if (sd_parent_degenerate(tmp, parent)) {
- tmp->parent = parent->parent;
- if (parent->parent)
- parent->parent->child = tmp;
- /*
- * Transfer SD_PREFER_SIBLING down in case of a
- * degenerate parent; the spans match for this
- * so the property transfers.
- */
- if (parent->flags & SD_PREFER_SIBLING)
- tmp->flags |= SD_PREFER_SIBLING;
- destroy_sched_domain(parent);
- } else
- tmp = tmp->parent;
- }
-
- if (sd && sd_degenerate(sd)) {
- tmp = sd;
- sd = sd->parent;
- destroy_sched_domain(tmp);
- if (sd)
- sd->child = NULL;
- }
-
- sched_domain_debug(sd, cpu);
-
- rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- destroy_sched_domains(tmp);
-
- update_top_cache_domain(cpu);
-}
-
-/* Setup the mask of cpus configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
- int ret;
-
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
- ret = cpulist_parse(str, cpu_isolated_map);
- if (ret) {
- pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
- return 0;
- }
- return 1;
-}
-__setup("isolcpus=", isolated_cpu_setup);
-
-struct s_data {
- struct sched_domain ** __percpu sd;
- struct root_domain *rd;
-};
-
-enum s_alloc {
- sa_rootdomain,
- sa_sd,
- sa_sd_storage,
- sa_none,
-};
-
-/*
- * Build an iteration mask that can exclude certain CPUs from the upwards
- * domain traversal.
- *
- * Asymmetric node setups can result in situations where the domain tree is of
- * unequal depth, make sure to skip domains that already cover the entire
- * range.
- *
- * In that case build_sched_domains() will have terminated the iteration early
- * and our sibling sd spans will be empty. Domains should always include the
- * cpu they're built on, so check that.
- *
- */
-static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
-{
- const struct cpumask *span = sched_domain_span(sd);
- struct sd_data *sdd = sd->private;
- struct sched_domain *sibling;
- int i;
-
- for_each_cpu(i, span) {
- sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
- continue;
-
- cpumask_set_cpu(i, sched_group_mask(sg));
- }
-}
-
-/*
- * Return the canonical balance cpu for this group, this is the first cpu
- * of this group that's also in the iteration mask.
- */
-int group_balance_cpu(struct sched_group *sg)
-{
- return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
-}
-
-static int
-build_overlap_sched_groups(struct sched_domain *sd, int cpu)
-{
- struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
- const struct cpumask *span = sched_domain_span(sd);
- struct cpumask *covered = sched_domains_tmpmask;
- struct sd_data *sdd = sd->private;
- struct sched_domain *sibling;
- int i;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct cpumask *sg_span;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- sibling = *per_cpu_ptr(sdd->sd, i);
-
- /* See the comment near build_group_mask(). */
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
- continue;
-
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(cpu));
-
- if (!sg)
- goto fail;
-
- sg_span = sched_group_cpus(sg);
- if (sibling->child)
- cpumask_copy(sg_span, sched_domain_span(sibling->child));
- else
- cpumask_set_cpu(i, sg_span);
-
- cpumask_or(covered, covered, sg_span);
-
- sg->sgc = *per_cpu_ptr(sdd->sgc, i);
- if (atomic_inc_return(&sg->sgc->ref) == 1)
- build_group_mask(sd, sg);
-
- /*
- * Initialize sgc->capacity such that even if we mess up the
- * domains and no possible iteration will get us here, we won't
- * die on a /0 trap.
- */
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
-
- /*
- * Make sure the first group of this domain contains the
- * canonical balance cpu. Otherwise the sched_domain iteration
- * breaks. See update_sg_lb_stats().
- */
- if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- group_balance_cpu(sg) == cpu)
- groups = sg;
-
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- last->next = first;
- }
- sd->groups = groups;
-
- return 0;
-
-fail:
- free_sched_groups(first, 0);
-
- return -ENOMEM;
-}
-
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
-{
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
- struct sched_domain *child = sd->child;
-
- if (child)
- cpu = cpumask_first(sched_domain_span(child));
-
- if (sg) {
- *sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
- atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
- }
-
- return cpu;
-}
-
-/*
- * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
- *
- * Assumes the sched_domain tree is fully constructed
- */
-static int
-build_sched_groups(struct sched_domain *sd, int cpu)
-{
- struct sched_group *first = NULL, *last = NULL;
- struct sd_data *sdd = sd->private;
- const struct cpumask *span = sched_domain_span(sd);
- struct cpumask *covered;
- int i;
-
- get_group(cpu, sdd, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (cpu != cpumask_first(span))
- return 0;
-
- lockdep_assert_held(&sched_domains_mutex);
- covered = sched_domains_tmpmask;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct sched_group *sg;
- int group, j;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- group = get_group(i, sdd, &sg);
- cpumask_setall(sched_group_mask(sg));
-
- for_each_cpu(j, span) {
- if (get_group(j, sdd, NULL) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
-
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- }
- last->next = first;
-
- return 0;
-}
-
-/*
- * Initialize sched groups cpu_capacity.
- *
- * cpu_capacity indicates the capacity of sched group, which is used while
- * distributing the load between different sched groups in a sched domain.
- * Typically cpu_capacity for all the groups in a sched domain will be the same
- * unless there are asymmetries in the topology. If there are asymmetries,
- * the group having more cpu_capacity will pick up more load compared to the
- * group having less cpu_capacity.
- */
-static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
-{
- struct sched_group *sg = sd->groups;
-
- WARN_ON(!sg);
-
- do {
- int cpu, max_cpu = -1;
-
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
-
- if (!(sd->flags & SD_ASYM_PACKING))
- goto next;
-
- for_each_cpu(cpu, sched_group_cpus(sg)) {
- if (max_cpu < 0)
- max_cpu = cpu;
- else if (sched_asym_prefer(cpu, max_cpu))
- max_cpu = cpu;
- }
- sg->asym_prefer_cpu = max_cpu;
-
-next:
- sg = sg->next;
- } while (sg != sd->groups);
-
- if (cpu != group_balance_cpu(sg))
- return;
-
- update_group_capacity(sd, cpu);
-}
-
-/*
- * Initializers for schedule domains
- * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
- */
-
-static int default_relax_domain_level = -1;
-int sched_domain_level_max;
-
-static int __init setup_relax_domain_level(char *str)
-{
- if (kstrtoint(str, 0, &default_relax_domain_level))
- pr_warn("Unable to set relax_domain_level\n");
-
- return 1;
-}
-__setup("relax_domain_level=", setup_relax_domain_level);
-
-static void set_domain_attribute(struct sched_domain *sd,
- struct sched_domain_attr *attr)
-{
- int request;
-
- if (!attr || attr->relax_domain_level < 0) {
- if (default_relax_domain_level < 0)
- return;
- else
- request = default_relax_domain_level;
- } else
- request = attr->relax_domain_level;
- if (request < sd->level) {
- /* turn off idle balance on this domain */
- sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- } else {
- /* turn on idle balance on this domain */
- sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- }
-}
-
-static void __sdt_free(const struct cpumask *cpu_map);
-static int __sdt_alloc(const struct cpumask *cpu_map);
-
-static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
- const struct cpumask *cpu_map)
-{
- switch (what) {
- case sa_rootdomain:
- if (!atomic_read(&d->rd->refcount))
- free_rootdomain(&d->rd->rcu); /* fall through */
- case sa_sd:
- free_percpu(d->sd); /* fall through */
- case sa_sd_storage:
- __sdt_free(cpu_map); /* fall through */
- case sa_none:
- break;
- }
-}
-
-static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
- const struct cpumask *cpu_map)
-{
- memset(d, 0, sizeof(*d));
-
- if (__sdt_alloc(cpu_map))
- return sa_sd_storage;
- d->sd = alloc_percpu(struct sched_domain *);
- if (!d->sd)
- return sa_sd_storage;
- d->rd = alloc_rootdomain();
- if (!d->rd)
- return sa_sd;
- return sa_rootdomain;
-}
-
-/*
- * NULL the sd_data elements we've used to build the sched_domain and
- * sched_group structure so that the subsequent __free_domain_allocs()
- * will not free the data we're using.
- */
-static void claim_allocations(int cpu, struct sched_domain *sd)
-{
- struct sd_data *sdd = sd->private;
-
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
- *per_cpu_ptr(sdd->sd, cpu) = NULL;
-
- if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
- *per_cpu_ptr(sdd->sds, cpu) = NULL;
-
- if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
- *per_cpu_ptr(sdd->sg, cpu) = NULL;
-
- if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
- *per_cpu_ptr(sdd->sgc, cpu) = NULL;
-}
-
-#ifdef CONFIG_NUMA
-static int sched_domains_numa_levels;
-enum numa_topology_type sched_numa_topology_type;
-static int *sched_domains_numa_distance;
-int sched_max_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
-static int sched_domains_curr_level;
-#endif
-
-/*
- * SD_flags allowed in topology descriptions.
- *
- * These flags are purely descriptive of the topology and do not prescribe
- * behaviour. Behaviour is artificial and mapped in the below sd_init()
- * function:
- *
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
- * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
- *
- * The odd one out, which besides describing the topology also
- * prescribes the desired behaviour that goes along with it:
- *
- * SD_ASYM_PACKING - describes SMT quirks
- */
-#define TOPOLOGY_SD_FLAGS \
- (SD_SHARE_CPUCAPACITY | \
- SD_SHARE_PKG_RESOURCES | \
- SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_ASYM_CPUCAPACITY | \
- SD_SHARE_POWERDOMAIN)
-
-static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map,
- struct sched_domain *child, int cpu)
-{
- struct sd_data *sdd = &tl->data;
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
- int sd_id, sd_weight, sd_flags = 0;
-
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
-
- if (tl->sd_flags)
- sd_flags = (*tl->sd_flags)();
- if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
- "wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
-
- *sd = (struct sched_domain){
- .min_interval = sd_weight,
- .max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
-
- .cache_nice_tries = 0,
- .busy_idx = 0,
- .idle_idx = 0,
- .newidle_idx = 0,
- .wake_idx = 0,
- .forkexec_idx = 0,
-
- .flags = 1*SD_LOAD_BALANCE
- | 1*SD_BALANCE_NEWIDLE
- | 1*SD_BALANCE_EXEC
- | 1*SD_BALANCE_FORK
- | 0*SD_BALANCE_WAKE
- | 1*SD_WAKE_AFFINE
- | 0*SD_SHARE_CPUCAPACITY
- | 0*SD_SHARE_PKG_RESOURCES
- | 0*SD_SERIALIZE
- | 0*SD_PREFER_SIBLING
- | 0*SD_NUMA
- | sd_flags
- ,
-
- .last_balance = jiffies,
- .balance_interval = sd_weight,
- .smt_gain = 0,
- .max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
- .child = child,
-#ifdef CONFIG_SCHED_DEBUG
- .name = tl->name,
-#endif
- };
-
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- sd_id = cpumask_first(sched_domain_span(sd));
-
- /*
- * Convert topological properties into behaviour.
- */
-
- if (sd->flags & SD_ASYM_CPUCAPACITY) {
- struct sched_domain *t = sd;
-
- for_each_lower_domain(t)
- t->flags |= SD_BALANCE_WAKE;
- }
-
- if (sd->flags & SD_SHARE_CPUCAPACITY) {
- sd->flags |= SD_PREFER_SIBLING;
- sd->imbalance_pct = 110;
- sd->smt_gain = 1178; /* ~15% */
-
- } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->imbalance_pct = 117;
- sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
-
-#ifdef CONFIG_NUMA
- } else if (sd->flags & SD_NUMA) {
- sd->cache_nice_tries = 2;
- sd->busy_idx = 3;
- sd->idle_idx = 2;
-
- sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
- sd->flags &= ~(SD_BALANCE_EXEC |
- SD_BALANCE_FORK |
- SD_WAKE_AFFINE);
- }
-
-#endif
- } else {
- sd->flags |= SD_PREFER_SIBLING;
- sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
- sd->idle_idx = 1;
- }
-
- /*
- * For all levels sharing cache; connect a sched_domain_shared
- * instance.
- */
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
- atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
-
- sd->private = sdd;
-
- return sd;
-}
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
- { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology =
- default_topology;
-
-#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->mask; tl++)
-
-void set_sched_topology(struct sched_domain_topology_level *tl)
-{
- if (WARN_ON_ONCE(sched_smp_initialized))
- return;
-
- sched_domain_topology = tl;
-}
-
-#ifdef CONFIG_NUMA
-
-static const struct cpumask *sd_numa_mask(int cpu)
-{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
-}
-
-static void sched_numa_warn(const char *str)
-{
- static int done = false;
- int i,j;
-
- if (done)
- return;
-
- done = true;
-
- printk(KERN_WARNING "ERROR: %s\n\n", str);
-
- for (i = 0; i < nr_node_ids; i++) {
- printk(KERN_WARNING " ");
- for (j = 0; j < nr_node_ids; j++)
- printk(KERN_CONT "%02d ", node_distance(i,j));
- printk(KERN_CONT "\n");
- }
- printk(KERN_WARNING "\n");
-}
-
-bool find_numa_distance(int distance)
-{
- int i;
-
- if (distance == node_distance(0, 0))
- return true;
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- if (sched_domains_numa_distance[i] == distance)
- return true;
- }
-
- return false;
-}
-
-/*
- * A system can have three types of NUMA topology:
- * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
- * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
- * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
- *
- * The difference between a glueless mesh topology and a backplane
- * topology lies in whether communication between not directly
- * connected nodes goes through intermediary nodes (where programs
- * could run), or through backplane controllers. This affects
- * placement of programs.
- *
- * The type of topology can be discerned with the following tests:
- * - If the maximum distance between any nodes is 1 hop, the system
- * is directly connected.
- * - If for two nodes A and B, located N > 1 hops away from each other,
- * there is an intermediary node C, which is < N hops away from both
- * nodes A and B, the system is a glueless mesh.
- */
-static void init_numa_topology_type(void)
-{
- int a, b, c, n;
-
- n = sched_max_numa_distance;
-
- if (sched_domains_numa_levels <= 1) {
- sched_numa_topology_type = NUMA_DIRECT;
- return;
- }
-
- for_each_online_node(a) {
- for_each_online_node(b) {
- /* Find two nodes furthest removed from each other. */
- if (node_distance(a, b) < n)
- continue;
-
- /* Is there an intermediary node between a and b? */
- for_each_online_node(c) {
- if (node_distance(a, c) < n &&
- node_distance(b, c) < n) {
- sched_numa_topology_type =
- NUMA_GLUELESS_MESH;
- return;
- }
- }
-
- sched_numa_topology_type = NUMA_BACKPLANE;
- return;
- }
- }
-}
-
-static void sched_init_numa(void)
-{
- int next_distance, curr_distance = node_distance(0, 0);
- struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
-
- sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
-
- /*
- * O(nr_nodes^2) deduplicating selection sort -- in order to find the
- * unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
- */
- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
-
- /*
- * This is not a strong assumption, but it would be nice to know
- * about cases where node A is connected to B, yet B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
-
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
- }
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
- }
-
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
- }
-
- if (!level)
- return;
-
- /*
- * 'level' contains the number of unique distances, excluding the
- * identity distance node_distance(i,i).
- *
- * The sched_domains_numa_distance[] array includes the actual distance
- * numbers.
- */
-
- /*
- * Here, we should temporarily reset sched_domains_numa_levels to 0.
- * If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain fewer than 'level' members. This could be
- * dangerous when we use it to iterate array sched_domains_numa_masks[][]
- * in other functions.
- *
- * We reset it to 'level' at the end of this function.
- */
- sched_domains_numa_levels = 0;
-
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
- if (!sched_domains_numa_masks)
- return;
-
- /*
- * Now for each level, construct a mask per node which contains all
- * cpus of nodes that are that many hops away from us.
- */
- for (i = 0; i < level; i++) {
- sched_domains_numa_masks[i] =
- kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!sched_domains_numa_masks[i])
- return;
-
- for (j = 0; j < nr_node_ids; j++) {
- struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
- if (!mask)
- return;
-
- sched_domains_numa_masks[i][j] = mask;
-
- for_each_node(k) {
- if (node_distance(j, k) > sched_domains_numa_distance[i])
- continue;
-
- cpumask_or(mask, mask, cpumask_of_node(k));
- }
- }
- }
-
- /* Compute default topology size */
- for (i = 0; sched_domain_topology[i].mask; i++);
-
- tl = kzalloc((i + level + 1) *
- sizeof(struct sched_domain_topology_level), GFP_KERNEL);
- if (!tl)
- return;
-
- /*
- * Copy the default topology bits..
- */
- for (i = 0; sched_domain_topology[i].mask; i++)
- tl[i] = sched_domain_topology[i];
-
- /*
- * .. and append 'j' levels of NUMA goodness.
- */
- for (j = 0; j < level; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
- }
-
- sched_domain_topology = tl;
-
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
-
- init_numa_topology_type();
-}
-
-static void sched_domains_numa_masks_set(unsigned int cpu)
-{
- int node = cpu_to_node(cpu);
- int i, j;
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
- cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
- }
- }
-}
-
-static void sched_domains_numa_masks_clear(unsigned int cpu)
-{
- int i, j;
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++)
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
- }
-}
-
-#else
-static inline void sched_init_numa(void) { }
-static void sched_domains_numa_masks_set(unsigned int cpu) { }
-static void sched_domains_numa_masks_clear(unsigned int cpu) { }
-#endif /* CONFIG_NUMA */
-
-static int __sdt_alloc(const struct cpumask *cpu_map)
-{
- struct sched_domain_topology_level *tl;
- int j;
-
- for_each_sd_topology(tl) {
- struct sd_data *sdd = &tl->data;
-
- sdd->sd = alloc_percpu(struct sched_domain *);
- if (!sdd->sd)
- return -ENOMEM;
-
- sdd->sds = alloc_percpu(struct sched_domain_shared *);
- if (!sdd->sds)
- return -ENOMEM;
-
- sdd->sg = alloc_percpu(struct sched_group *);
- if (!sdd->sg)
- return -ENOMEM;
-
- sdd->sgc = alloc_percpu(struct sched_group_capacity *);
- if (!sdd->sgc)
- return -ENOMEM;
-
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
- struct sched_domain_shared *sds;
- struct sched_group *sg;
- struct sched_group_capacity *sgc;
-
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sd)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sd, j) = sd;
-
- sds = kzalloc_node(sizeof(struct sched_domain_shared),
- GFP_KERNEL, cpu_to_node(j));
- if (!sds)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sds, j) = sds;
-
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sg)
- return -ENOMEM;
-
- sg->next = sg;
-
- *per_cpu_ptr(sdd->sg, j) = sg;
-
- sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sgc)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sgc, j) = sgc;
- }
- }
-
- return 0;
-}
-
-static void __sdt_free(const struct cpumask *cpu_map)
-{
- struct sched_domain_topology_level *tl;
- int j;
-
- for_each_sd_topology(tl) {
- struct sd_data *sdd = &tl->data;
-
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
-
- if (sdd->sd) {
- sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
- free_sched_groups(sd->groups, 0);
- kfree(*per_cpu_ptr(sdd->sd, j));
- }
-
- if (sdd->sds)
- kfree(*per_cpu_ptr(sdd->sds, j));
- if (sdd->sg)
- kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgc)
- kfree(*per_cpu_ptr(sdd->sgc, j));
- }
- free_percpu(sdd->sd);
- sdd->sd = NULL;
- free_percpu(sdd->sds);
- sdd->sds = NULL;
- free_percpu(sdd->sg);
- sdd->sg = NULL;
- free_percpu(sdd->sgc);
- sdd->sgc = NULL;
- }
-}
-
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int cpu)
-{
- struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
-
- if (child) {
- sd->level = child->level + 1;
- sched_domain_level_max = max(sched_domain_level_max, sd->level);
- child->parent = sd;
-
- if (!cpumask_subset(sched_domain_span(child),
- sched_domain_span(sd))) {
- pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
- pr_err(" the %s domain not a subset of the %s domain\n",
- child->name, sd->name);
-#endif
- /* Fixup, ensure @sd has at least @child cpus. */
- cpumask_or(sched_domain_span(sd),
- sched_domain_span(sd),
- sched_domain_span(child));
- }
-
- }
- set_domain_attribute(sd, attr);
-
- return sd;
-}
-
/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
+ * used to mark begin/end of suspend/resume:
*/
-static int build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
-{
- enum s_alloc alloc_state;
- struct sched_domain *sd;
- struct s_data d;
- struct rq *rq = NULL;
- int i, ret = -ENOMEM;
-
- alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
- if (alloc_state != sa_rootdomain)
- goto error;
-
- /* Set up domains for cpus specified by the cpu_map. */
- for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl;
-
- sd = NULL;
- for_each_sd_topology(tl) {
- sd = build_sched_domain(tl, cpu_map, attr, sd, i);
- if (tl == sched_domain_topology)
- *per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
- sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
- }
- }
-
- /* Build the groups for the domains */
- for_each_cpu(i, cpu_map) {
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
- if (build_overlap_sched_groups(sd, i))
- goto error;
- } else {
- if (build_sched_groups(sd, i))
- goto error;
- }
- }
- }
-
- /* Calculate CPU capacity for physical packages and nodes */
- for (i = nr_cpumask_bits-1; i >= 0; i--) {
- if (!cpumask_test_cpu(i, cpu_map))
- continue;
-
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- claim_allocations(i, sd);
- init_sched_groups_capacity(i, sd);
- }
- }
-
- /* Attach the domains */
- rcu_read_lock();
- for_each_cpu(i, cpu_map) {
- rq = cpu_rq(i);
- sd = *per_cpu_ptr(d.sd, i);
-
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
-
- cpu_attach_domain(sd, d.rd, i);
- }
- rcu_read_unlock();
-
- if (rq && sched_debug_enabled) {
- pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
-
- ret = 0;
-error:
- __free_domain_allocs(&d, alloc_state, cpu_map);
- return ret;
-}
-
-static cpumask_var_t *doms_cur; /* current sched domains */
-static int ndoms_cur; /* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;
- /* attributes of custom domains in 'doms_cur' */
-
-/*
- * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask) fails, then fallback to a single sched domain,
- * as determined by the single cpumask fallback_doms.
- */
-static cpumask_var_t fallback_doms;
-
-/*
- * arch_update_cpu_topology lets virtualized architectures update the
- * cpu core maps. It is supposed to return 1 if the topology changed
- * or 0 if it stayed the same.
- */
-int __weak arch_update_cpu_topology(void)
-{
- return 0;
-}
-
-cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
-{
- int i;
- cpumask_var_t *doms;
-
- doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
- if (!doms)
- return NULL;
- for (i = 0; i < ndoms; i++) {
- if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
- free_sched_domains(doms, i);
- return NULL;
- }
- }
- return doms;
-}
-
-void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
-{
- unsigned int i;
- for (i = 0; i < ndoms; i++)
- free_cpumask_var(doms[i]);
- kfree(doms);
-}
-
-/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
-static int init_sched_domains(const struct cpumask *cpu_map)
-{
- int err;
-
- arch_update_cpu_topology();
- ndoms_cur = 1;
- doms_cur = alloc_sched_domains(ndoms_cur);
- if (!doms_cur)
- doms_cur = &fallback_doms;
- cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- err = build_sched_domains(doms_cur[0], NULL);
- register_sched_domain_sysctl();
-
- return err;
-}
-
-/*
- * Detach sched domains from a group of cpus specified in cpu_map.
- * These cpus will now be attached to the NULL domain.
- */
-static void detach_destroy_domains(const struct cpumask *cpu_map)
-{
- int i;
-
- rcu_read_lock();
- for_each_cpu(i, cpu_map)
- cpu_attach_domain(NULL, &def_root_domain, i);
- rcu_read_unlock();
-}
-
-/* handle null as "default" */
-static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
- struct sched_domain_attr *new, int idx_new)
-{
- struct sched_domain_attr tmp;
-
- /* fast path */
- if (!new && !cur)
- return 1;
-
- tmp = SD_ATTR_INIT;
- return !memcmp(cur ? (cur + idx_cur) : &tmp,
- new ? (new + idx_new) : &tmp,
- sizeof(struct sched_domain_attr));
-}
-
-/*
- * Partition sched domains as specified by the 'ndoms_new'
- * cpumasks in the array doms_new[] of cpumasks. This compares
- * doms_new[] to the current sched domain partitioning, doms_cur[].
- * It destroys each deleted domain and builds each new domain.
- *
- * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
- * The masks don't intersect (don't overlap). We should set up one
- * sched domain for each mask. CPUs not in any of the cpumasks will
- * not be load balanced. If the same cpumask appears both in the
- * current 'doms_cur' domains and in the new 'doms_new', we can leave
- * it as it is.
- *
- * The passed in 'doms_new' should be allocated using
- * alloc_sched_domains. This routine takes ownership of it and will
- * free_sched_domains it when done with it. If the caller failed the
- * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
- * and partition_sched_domains() will fall back to the single partition
- * 'fallback_doms'; it also forces the domains to be rebuilt.
- *
- * If doms_new == NULL it will be replaced with cpu_online_mask.
- * ndoms_new == 0 is a special case for destroying existing domains,
- * and it will not create the default domain.
- *
- * Call with hotplug lock held
- */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
-{
- int i, j, n;
- int new_topology;
-
- mutex_lock(&sched_domains_mutex);
-
- /* always unregister in case we don't destroy any domains */
- unregister_sched_domain_sysctl();
-
- /* Let architecture update cpu core mappings. */
- new_topology = arch_update_cpu_topology();
-
- n = doms_new ? ndoms_new : 0;
-
- /* Destroy deleted domains */
- for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_cur[i], doms_new[j])
- && dattrs_equal(dattr_cur, i, dattr_new, j))
- goto match1;
- }
- /* no match - a current sched domain not in new doms_new[] */
- detach_destroy_domains(doms_cur[i]);
-match1:
- ;
- }
-
- n = ndoms_cur;
- if (doms_new == NULL) {
- n = 0;
- doms_new = &fallback_doms;
- cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
- WARN_ON_ONCE(dattr_new);
- }
-
- /* Build new domains */
- for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_new[i], doms_cur[j])
- && dattrs_equal(dattr_new, i, dattr_cur, j))
- goto match2;
- }
- /* no match - add a new doms_new */
- build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
-match2:
- ;
- }
-
- /* Remember the new sched domains */
- if (doms_cur != &fallback_doms)
- free_sched_domains(doms_cur, ndoms_cur);
- kfree(dattr_cur); /* kfree(NULL) is safe */
- doms_cur = doms_new;
- dattr_cur = dattr_new;
- ndoms_cur = ndoms_new;
-
- register_sched_domain_sysctl();
-
- mutex_unlock(&sched_domains_mutex);
-}
-
-static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+static int num_cpus_frozen;
/*
* Update cpusets according to cpu_active mask. If cpusets are
@@ -7352,7 +5759,7 @@ int sched_cpu_activate(unsigned int cpu)
* Put the rq online, if not already. This happens:
*
* 1) In the early boot process, because we build the real domains
- * after all cpus have been brought up.
+ * after all CPUs have been brought up.
*
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
@@ -7467,7 +5874,7 @@ void __init sched_init_smp(void)
/*
* There's no userspace yet to cause hotplug operations; hence all the
- * cpu masks are stable and all blatant races in the below code cannot
+ * CPU masks are stable and all blatant races in the below code cannot
* happen.
*/
mutex_lock(&sched_domains_mutex);
@@ -7487,6 +5894,7 @@ void __init sched_init_smp(void)
init_sched_dl_class();
sched_init_smt();
+ sched_clock_init_late();
sched_smp_initialized = true;
}
@@ -7502,6 +5910,7 @@ early_initcall(migration_init);
void __init sched_init_smp(void)
{
sched_init_granularity();
+ sched_clock_init_late();
}
#endif /* CONFIG_SMP */
@@ -7545,6 +5954,8 @@ void __init sched_init(void)
int i, j;
unsigned long alloc_size = 0, ptr;
+ sched_clock_init();
+
for (i = 0; i < WAIT_TABLE_SIZE; i++)
init_waitqueue_head(bit_wait_table + i);
@@ -7583,10 +5994,8 @@ void __init sched_init(void)
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
- init_rt_bandwidth(&def_rt_bandwidth,
- global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth,
- global_rt_period(), global_rt_runtime());
+ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
+ init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
@@ -7622,18 +6031,18 @@ void __init sched_init(void)
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
- * How much cpu bandwidth does root_task_group get?
+ * How much CPU bandwidth does root_task_group get?
*
* In case of task-groups formed through the cgroup filesystem, it
- * gets 100% of the cpu resources in the system. This overall
- * system cpu resource is divided among the tasks of
+ * gets 100% of the CPU resources in the system. This overall
+ * system CPU resource is divided among the tasks of
* root_task_group and its child task-groups in a fair manner,
* based on each entity's (task or task-group's) weight
* (se->load.weight).
*
* In other words, if root_task_group has 10 tasks (each of weight
* 1024) and two child groups A0 and A1 (of weight 1024 each),
- * then A0's share of the cpu resource is:
+ * then A0's share of the CPU resource is:
*
* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
@@ -7742,10 +6151,14 @@ EXPORT_SYMBOL(__might_sleep);
void ___might_sleep(const char *file, int line, int preempt_offset)
{
- static unsigned long prev_jiffy; /* ratelimiting */
+ /* Ratelimiting timestamp: */
+ static unsigned long prev_jiffy;
+
unsigned long preempt_disable_ip;
- rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+ /* WARN_ON_ONCE() by default, no rate limit required: */
+ rcu_sleep_check();
+
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
@@ -7754,7 +6167,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
return;
prev_jiffy = jiffies;
- /* Save this before calling printk(), since that will clobber it */
+ /* Save this before calling printk(), since that will clobber it: */
preempt_disable_ip = get_preempt_disable_ip(current);
printk(KERN_ERR
@@ -7833,7 +6246,7 @@ void normalize_rt_tasks(void)
*/
/**
- * curr_task - return the current task for a given cpu.
+ * curr_task - return the current task for a given CPU.
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
@@ -7849,13 +6262,13 @@ struct task_struct *curr_task(int cpu)
#ifdef CONFIG_IA64
/**
- * set_curr_task - set the current task for a given cpu.
+ * set_curr_task - set the current task for a given CPU.
* @cpu: the processor in question.
* @p: the task pointer to set.
*
* Description: This function must only be used when non-maskable interrupts
* are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a cpu in a non-blocking manner. This function
+ * notion of the current task on a CPU in a non-blocking manner. This function
* must be called with all CPUs synchronized and interrupts disabled; the
* caller must save the original value of the current task (see
* curr_task() above) and restore that value before reenabling interrupts and
@@ -7911,7 +6324,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
- WARN_ON(!parent); /* root should already exist */
+ /* Root should already exist: */
+ WARN_ON(!parent);
tg->parent = parent;
INIT_LIST_HEAD(&tg->children);
@@ -7924,13 +6338,13 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
/* rcu callback to free various structures associated with a task group */
static void sched_free_group_rcu(struct rcu_head *rhp)
{
- /* now it should be safe to free those cfs_rqs */
+ /* Now it should be safe to free those cfs_rqs: */
sched_free_group(container_of(rhp, struct task_group, rcu));
}
void sched_destroy_group(struct task_group *tg)
{
- /* wait for possible concurrent references to cfs_rqs complete */
+ /* Wait for possible concurrent references to cfs_rqs to complete: */
call_rcu(&tg->rcu, sched_free_group_rcu);
}
@@ -7938,7 +6352,7 @@ void sched_offline_group(struct task_group *tg)
{
unsigned long flags;
- /* end participation in shares distribution */
+ /* End participation in shares distribution: */
unregister_fair_sched_group(tg);
spin_lock_irqsave(&task_group_lock, flags);
@@ -7983,20 +6397,21 @@ void sched_move_task(struct task_struct *tsk)
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
+ update_rq_clock(rq);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
+ if (running)
put_prev_task(rq, tsk);
sched_change_group(tsk, TASK_MOVE_GROUP);
if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
- if (unlikely(running))
+ if (running)
set_curr_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf);
@@ -8366,11 +6781,14 @@ int sched_rr_handler(struct ctl_table *table, int write,
mutex_lock(&mutex);
ret = proc_dointvec(table, write, buffer, lenp, ppos);
- /* make sure that internally we keep jiffies */
- /* also, writing zero resets timeslice to default */
+ /*
+ * Make sure that internally we keep jiffies.
+ * Also, writing zero resets the timeslice to default:
+ */
if (!ret && write) {
- sched_rr_timeslice = sched_rr_timeslice <= 0 ?
- RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ sched_rr_timeslice =
+ sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+ msecs_to_jiffies(sysctl_sched_rr_timeslice);
}
mutex_unlock(&mutex);
return ret;
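The sched_rr_handler() hunk above separates the user-visible value (sysctl_sched_rr_timeslice, in milliseconds) from the scheduler-internal one (sched_rr_timeslice, in jiffies). A hypothetical illustration of the conversion, assuming CONFIG_HZ=250:

	sysctl_sched_rr_timeslice = 100;		/* userspace wrote 100 ms */
	sched_rr_timeslice = msecs_to_jiffies(100);	/* 25 jiffies at HZ=250 */

	sysctl_sched_rr_timeslice = 0;			/* zero (or a negative value) resets ... */
	sched_rr_timeslice = RR_TIMESLICE;		/* ... to the default timeslice */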
@@ -8431,6 +6849,7 @@ static void cpu_cgroup_fork(struct task_struct *task)
rq = task_rq_lock(task, &rf);
+ update_rq_clock(rq);
sched_change_group(task, TASK_SET_GROUP);
task_rq_unlock(rq, task, &rf);
@@ -8550,9 +6969,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
cfs_b->quota = quota;
__refill_cfs_bandwidth_runtime(cfs_b);
- /* restart the period timer (if active) to handle new period expiry */
+
+ /* Restart the period timer (if active) to handle new period expiry: */
if (runtime_enabled)
start_cfs_bandwidth(cfs_b);
+
raw_spin_unlock_irq(&cfs_b->lock);
for_each_online_cpu(i) {
@@ -8690,8 +7111,8 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
parent_quota = parent_b->hierarchical_quota;
/*
- * ensure max(child_quota) <= parent_quota, inherit when no
- * limit is set
+ * Ensure max(child_quota) <= parent_quota, inherit when no
+ * limit is set:
*/
if (quota == RUNTIME_INF)
quota = parent_quota;
@@ -8800,7 +7221,7 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
- { } /* terminate */
+ { } /* Terminate */
};
struct cgroup_subsys cpu_cgrp_subsys = {
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9add206b5608..f95ab29a45d0 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n",
cpuacct_stat_desc[stat],
- (long long)cputime64_to_clock_t(val[stat]));
+ (long long)nsec_to_clock_t(val[stat]));
}
return 0;
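With the cpuacct counters now kept in nanoseconds, the value is exported to userspace in USER_HZ clock ticks via nsec_to_clock_t() rather than cputime64_to_clock_t(). A simplified sketch of that conversion (the name example_nsec_to_clock_t and the single-division form are illustrative; the real helper covers more HZ/USER_HZ combinations):

static inline u64 example_nsec_to_clock_t(u64 nsec)
{
	/* USER_HZ is normally 100, i.e. one clock tick per 10 ms. */
	return div_u64(nsec, NSEC_PER_SEC / USER_HZ);
}

For instance, 2,500,000,000 ns of accumulated CPUTIME_USER would be reported as 250 ticks.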
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7700a9cba335..2ecec3a4f1ee 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,6 +4,7 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
+#include <linux/cputime.h>
#include "sched.h"
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
@@ -44,6 +45,7 @@ void disable_sched_clock_irqtime(void)
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -61,49 +63,34 @@ void irqtime_account_irq(struct task_struct *curr)
* in that case, so as not to confuse the scheduler with a special task
* that does not consume any time, but still wants to run.
*/
- if (hardirq_count())
- irqtime->hardirq_time += delta;
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- irqtime->softirq_time += delta;
+ if (hardirq_count()) {
+ cpustat[CPUTIME_IRQ] += delta;
+ irqtime->tick_delta += delta;
+ } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
+ cpustat[CPUTIME_SOFTIRQ] += delta;
+ irqtime->tick_delta += delta;
+ }
u64_stats_update_end(&irqtime->sync);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
-static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
+static u64 irqtime_tick_accounted(u64 maxtime)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- cputime_t irq_cputime;
-
- irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
- irq_cputime = min(irq_cputime, maxtime);
- cpustat[idx] += irq_cputime;
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ u64 delta;
- return irq_cputime;
-}
+ delta = min(irqtime->tick_delta, maxtime);
+ irqtime->tick_delta -= delta;
-static cputime_t irqtime_account_hi_update(cputime_t maxtime)
-{
- return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
- CPUTIME_IRQ, maxtime);
-}
-
-static cputime_t irqtime_account_si_update(cputime_t maxtime)
-{
- return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
- CPUTIME_SOFTIRQ, maxtime);
+ return delta;
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
#define sched_clock_irqtime (0)
-static cputime_t irqtime_account_hi_update(cputime_t dummy)
-{
- return 0;
-}
-
-static cputime_t irqtime_account_si_update(cputime_t dummy)
+static u64 irqtime_tick_accounted(u64 dummy)
{
return 0;
}
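With this change, hard and soft IRQ time is charged to the cpustat counters directly in irqtime_account_irq(), and tick_delta only remembers how much of it has accrued since the last tick so the tick path can subtract it. A hypothetical illustration, ignoring steal time and assuming HZ=250 (one tick = 4,000,000 ns): if 1,200,000 ns of IRQ time accumulated in irqtime->tick_delta since the previous tick, irqtime_tick_accounted(4000000) returns 1,200,000 and zeroes tick_delta, leaving 2,800,000 ns of the tick to be charged as user, system, or idle time.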
@@ -129,7 +116,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in user space since the last update
*/
-void account_user_time(struct task_struct *p, cputime_t cputime)
+void account_user_time(struct task_struct *p, u64 cputime)
{
int index;
@@ -140,7 +127,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
+ task_group_account_field(p, index, cputime);
/* Account for user time used */
acct_account_cputime(p);
@@ -151,7 +138,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in virtual machine since the last update
*/
-static void account_guest_time(struct task_struct *p, cputime_t cputime)
+void account_guest_time(struct task_struct *p, u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
@@ -162,11 +149,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+ cpustat[CPUTIME_NICE] += cputime;
+ cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+ cpustat[CPUTIME_USER] += cputime;
+ cpustat[CPUTIME_GUEST] += cputime;
}
}
@@ -176,15 +163,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
* @cputime: the cpu time spent in kernel space since the last update
* @index: pointer to cpustat field that has to be updated
*/
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime, int index)
+void account_system_index_time(struct task_struct *p,
+ u64 cputime, enum cpu_usage_stat index)
{
/* Add system time to process. */
p->stime += cputime;
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
+ task_group_account_field(p, index, cputime);
/* Account for system time used */
acct_account_cputime(p);
@@ -196,8 +183,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, int index)
* @hardirq_offset: the offset to subtract from hardirq_count()
* @cputime: the cpu time spent in kernel space since the last update
*/
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime)
+void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
int index;
@@ -213,33 +199,33 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
index = CPUTIME_SYSTEM;
- __account_system_time(p, cputime, index);
+ account_system_index_time(p, cputime, index);
}
/*
* Account for involuntary wait time.
* @cputime: the cpu time spent in involuntary wait
*/
-void account_steal_time(cputime_t cputime)
+void account_steal_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
- cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+ cpustat[CPUTIME_STEAL] += cputime;
}
/*
* Account for idle time.
* @cputime: the cpu time spent in idle wait
*/
-void account_idle_time(cputime_t cputime)
+void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
struct rq *rq = this_rq();
if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+ cpustat[CPUTIME_IOWAIT] += cputime;
else
- cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+ cpustat[CPUTIME_IDLE] += cputime;
}
/*
@@ -247,21 +233,19 @@ void account_idle_time(cputime_t cputime)
* ticks are not redelivered later. Due to that, this function may on
* occasion account more time than the calling functions think elapsed.
*/
-static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
+static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
- cputime_t steal_cputime;
u64 steal;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
+ steal = min(steal, maxtime);
+ account_steal_time(steal);
+ this_rq()->prev_steal_time += steal;
- steal_cputime = min(nsecs_to_cputime(steal), maxtime);
- account_steal_time(steal_cputime);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
-
- return steal_cputime;
+ return steal;
}
#endif
return 0;
@@ -270,9 +254,9 @@ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
/*
* Account how much elapsed time was spent in steal, irq, or softirq time.
*/
-static inline cputime_t account_other_time(cputime_t max)
+static inline u64 account_other_time(u64 max)
{
- cputime_t accounted;
+ u64 accounted;
/* Shall be converted to a lockdep-enabled lightweight check */
WARN_ON_ONCE(!irqs_disabled());
@@ -280,10 +264,7 @@ static inline cputime_t account_other_time(cputime_t max)
accounted = steal_account_process_time(max);
if (accounted < max)
- accounted += irqtime_account_hi_update(max - accounted);
-
- if (accounted < max)
- accounted += irqtime_account_si_update(max - accounted);
+ accounted += irqtime_tick_accounted(max - accounted);
return accounted;
}
@@ -315,7 +296,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
- cputime_t utime, stime;
+ u64 utime, stime;
struct task_struct *t;
unsigned int seq, nextseq;
unsigned long flags;
@@ -379,8 +360,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
- u64 cputime = (__force u64) cputime_one_jiffy * ticks;
- cputime_t other;
+ u64 other, cputime = TICK_NSEC * ticks;
/*
* When returning from idle, many ticks can get accounted at
@@ -392,6 +372,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
other = account_other_time(ULONG_MAX);
if (other >= cputime)
return;
+
cputime -= other;
if (this_cpu_ksoftirqd() == p) {
@@ -400,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime, CPUTIME_SOFTIRQ);
+ account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime);
} else if (p == rq->idle) {
@@ -408,7 +389,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime);
} else {
- __account_system_time(p, cputime, CPUTIME_SYSTEM);
+ account_system_index_time(p, cputime, CPUTIME_SYSTEM);
}
}
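The if/else ladder in irqtime_account_process_tick() is precedence-sensitive: a tick landing in ksoftirqd must be claimed as softirq time before the user/idle/guest tests get a look. A hedged userspace sketch of just that precedence (classify_tick() and its flags are illustrative, not kernel code):

#include <stdio.h>

enum tick_class { TICK_SOFTIRQ, TICK_USER, TICK_IDLE, TICK_GUEST, TICK_SYSTEM };

/* Mirrors the branch order of irqtime_account_process_tick(). */
static enum tick_class classify_tick(int is_ksoftirqd, int user_tick,
				     int is_idle, int is_vcpu)
{
	if (is_ksoftirqd)
		return TICK_SOFTIRQ;	/* ksoftirqd wins over every other class */
	if (user_tick)
		return TICK_USER;
	if (is_idle)
		return TICK_IDLE;
	if (is_vcpu)
		return TICK_GUEST;	/* PF_VCPU: system time counted as guest */
	return TICK_SYSTEM;
}

int main(void)
{
	/* A user tick taken while running ksoftirqd is still softirq time. */
	printf("%d\n", classify_tick(1, 1, 0, 0));	/* prints 0 (TICK_SOFTIRQ) */
	return 0;
}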
@@ -437,9 +418,7 @@ void vtime_common_task_switch(struct task_struct *prev)
else
vtime_account_system(prev);
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- vtime_account_user(prev);
-#endif
+ vtime_flush(prev);
arch_vtime_task_switch(prev);
}
#endif
@@ -467,14 +446,14 @@ void vtime_account_irq_enter(struct task_struct *tsk)
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
*ut = p->utime;
*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime;
@@ -491,7 +470,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t cputime, steal;
+ u64 cputime, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
@@ -502,7 +481,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
- cputime = cputime_one_jiffy;
+ cputime = TICK_NSEC;
steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
@@ -524,14 +503,14 @@ void account_process_tick(struct task_struct *p, int user_tick)
*/
void account_idle_ticks(unsigned long ticks)
{
- cputime_t cputime, steal;
+ u64 cputime, steal;
if (sched_clock_irqtime) {
irqtime_account_idle_ticks(ticks);
return;
}
- cputime = jiffies_to_cputime(ticks);
+ cputime = ticks * TICK_NSEC;
steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
@@ -545,7 +524,7 @@ void account_idle_ticks(unsigned long ticks)
* Perform (stime * rtime) / total, but avoid multiplication overflow by
 * losing precision when the numbers are big.
*/
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
u64 scaled;
@@ -582,7 +561,7 @@ drop_precision:
* followed by a 64/32->64 divide.
*/
scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return (__force cputime_t) scaled;
+ return scaled;
}
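With the __force cast gone, scale_stime() is a plain u64 helper. Its bit-dropping loop approximates the exact quantity stime * rtime / total without needing a 128-bit divide; on a compiler that provides __uint128_t the target value can be checked directly:

#include <stdint.h>
#include <stdio.h>

/* Exact stime * rtime / total -- the value scale_stime() approximates. */
static uint64_t scale_exact(uint64_t stime, uint64_t rtime, uint64_t total)
{
	return (uint64_t)(((__uint128_t)stime * rtime) / total);
}

int main(void)
{
	/* Ticks saw 3 s stime + 2 s utime, but 10 s actually ran: stime -> 6 s. */
	uint64_t ns = scale_exact(3000000000ULL, 10000000000ULL, 5000000000ULL);

	printf("%llu\n", (unsigned long long)ns);	/* 6000000000 */
	return 0;
}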
/*
@@ -607,14 +586,14 @@ drop_precision:
*/
static void cputime_adjust(struct task_cputime *curr,
struct prev_cputime *prev,
- cputime_t *ut, cputime_t *st)
+ u64 *ut, u64 *st)
{
- cputime_t rtime, stime, utime;
+ u64 rtime, stime, utime;
unsigned long flags;
/* Serialize concurrent callers such that we can honour our guarantees */
raw_spin_lock_irqsave(&prev->lock, flags);
- rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+ rtime = curr->sum_exec_runtime;
/*
* This is possible under two circumstances:
@@ -645,8 +624,7 @@ static void cputime_adjust(struct task_cputime *curr,
goto update;
}
- stime = scale_stime((__force u64)stime, (__force u64)rtime,
- (__force u64)(stime + utime));
+ stime = scale_stime(stime, rtime, stime + utime);
update:
/*
@@ -679,7 +657,7 @@ out:
raw_spin_unlock_irqrestore(&prev->lock, flags);
}
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime = {
.sum_exec_runtime = p->se.sum_exec_runtime,
@@ -690,7 +668,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime;
@@ -700,20 +678,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static cputime_t vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
if (time_before(now, (unsigned long)tsk->vtime_snap))
return 0;
- return jiffies_to_cputime(now - tsk->vtime_snap);
+ return jiffies_to_nsecs(now - tsk->vtime_snap);
}
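The conversion here is purely a unit change: jiffies_to_nsecs(n) is essentially n * TICK_NSEC, so at HZ=1000 a 3-jiffy delta now yields 3,000,000 ns where the old code handed back a 3-jiffy cputime_t.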
-static cputime_t get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
- cputime_t delta, other;
+ u64 delta, other;
/*
* Unlike tick based timing, vtime based timing never has lost
@@ -722,7 +700,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
* elapsed time. Limit account_other_time to prevent rounding
* errors from causing elapsed vtime to go negative.
*/
- delta = jiffies_to_cputime(now - tsk->vtime_snap);
+ delta = jiffies_to_nsecs(now - tsk->vtime_snap);
other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now;
@@ -732,9 +710,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
static void __vtime_account_system(struct task_struct *tsk)
{
- cputime_t delta_cpu = get_vtime_delta(tsk);
-
- account_system_time(tsk, irq_count(), delta_cpu);
+ account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
}
void vtime_account_system(struct task_struct *tsk)
@@ -749,14 +725,10 @@ void vtime_account_system(struct task_struct *tsk)
void vtime_account_user(struct task_struct *tsk)
{
- cputime_t delta_cpu;
-
write_seqcount_begin(&tsk->vtime_seqcount);
tsk->vtime_snap_whence = VTIME_SYS;
- if (vtime_delta(tsk)) {
- delta_cpu = get_vtime_delta(tsk);
- account_user_time(tsk, delta_cpu);
- }
+ if (vtime_delta(tsk))
+ account_user_time(tsk, get_vtime_delta(tsk));
write_seqcount_end(&tsk->vtime_seqcount);
}
@@ -797,9 +769,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
- cputime_t delta_cpu = get_vtime_delta(tsk);
-
- account_idle_time(delta_cpu);
+ account_idle_time(get_vtime_delta(tsk));
}
void arch_vtime_task_switch(struct task_struct *prev)
@@ -826,10 +796,10 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_restore(flags);
}
-cputime_t task_gtime(struct task_struct *t)
+u64 task_gtime(struct task_struct *t)
{
unsigned int seq;
- cputime_t gtime;
+ u64 gtime;
if (!vtime_accounting_enabled())
return t->gtime;
@@ -851,9 +821,9 @@ cputime_t task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
-void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
- cputime_t delta;
+ u64 delta;
unsigned int seq;
if (!vtime_accounting_enabled()) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 70ef2b1901e4..27737f34757d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -663,9 +663,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 * Nothing relies on rq->lock after this, so it's safe to drop
* rq->lock.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq_unpin_lock(rq, &rf);
push_dl_task(rq);
- lockdep_repin_lock(&rq->lock, rf.cookie);
+ rq_repin_lock(rq, &rf);
}
#endif
@@ -1118,7 +1118,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
}
struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
struct task_struct *p;
@@ -1133,9 +1133,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo
* disabled avoiding further scheduler activity on it and we're
* being very careful to re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
pull_dl_task(rq);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
/*
* pull_dl_task() can drop (and re-acquire) rq->lock; this
* means a stop task can slip in, in which case we need to
@@ -1729,12 +1729,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
#ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
-#else
+#endif
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
-#endif
}
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index fa178b62ea79..109adc0e9cb9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -953,6 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#endif
P(policy);
P(prio);
+ if (p->policy == SCHED_DEADLINE) {
+ P(dl.runtime);
+ P(dl.deadline);
+ }
#undef PN_SCHEDSTAT
#undef PN
#undef __PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6559d197e08a..274c747a01ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2657,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
if (tg_weight)
shares /= tg_weight;
+ /*
+ * MIN_SHARES has to be unscaled here to support per-CPU partitioning
+ * of a group with small tg->shares value. It is a floor value which is
+ * assigned as a minimum load.weight to the sched_entity representing
+ * the group on a CPU.
+ *
+ * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
+ * on an 8-core system with 8 tasks each runnable on one CPU shares has
+ * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
+ * case no task is runnable on a CPU, MIN_SHARES=2 should be returned
+ * instead of 0.
+ */
if (shares < MIN_SHARES)
shares = MIN_SHARES;
if (shares > tg->shares)
@@ -2689,16 +2701,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
+ return;
+
+ if (throttled_hierarchy(cfs_rq))
return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2707,8 +2723,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3424,7 +3441,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int idle_balance(struct rq *this_rq);
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
@@ -3453,7 +3470,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int idle_balance(struct rq *rq)
+static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -3582,10 +3599,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ /*
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
+ */
update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
@@ -3657,6 +3682,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
@@ -3681,7 +3715,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -3864,7 +3898,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* Ensure that runnable average is periodically updated.
*/
update_load_avg(curr, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4761,7 +4795,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
update_load_avg(se, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
if (!se)
@@ -4820,7 +4854,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
update_load_avg(se, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
if (!se)
@@ -6213,7 +6247,7 @@ preempt:
}
static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
@@ -6320,15 +6354,8 @@ simple:
return p;
idle:
- /*
- * This is OK, because current is on_cpu, which avoids it being picked
- * for load-balance and preemption/IRQs are still disabled avoiding
- * further scheduler activity on it and we're being very careful to
- * re-start the picking loop.
- */
- lockdep_unpin_lock(&rq->lock, cookie);
- new_tasks = idle_balance(rq);
- lockdep_repin_lock(&rq->lock, cookie);
+ new_tasks = idle_balance(rq, rf);
+
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
@@ -8077,6 +8104,7 @@ redo:
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
+ update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
@@ -8297,7 +8325,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static int idle_balance(struct rq *this_rq)
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -8311,6 +8339,14 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ rq_unpin_lock(this_rq, rf);
+
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
rcu_read_lock();
@@ -8388,6 +8424,8 @@ out:
if (pulled_task)
this_rq->idle_stamp = 0;
+ rq_repin_lock(this_rq, rf);
+
return pulled_task;
}
@@ -8443,6 +8481,7 @@ static int active_load_balance_cpu_stop(void *data)
};
schedstat_inc(sd->alb_count);
+ update_rq_clock(busiest_rq);
p = detach_one_task(&env);
if (p) {
@@ -9264,6 +9303,7 @@ void online_fair_sched_group(struct task_group *tg)
se = tg->se[i];
raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
raw_spin_unlock_irq(&rq->lock);
@@ -9356,8 +9396,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 5405d3feb112..0c00172db63e 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
}
static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
update_idle_core(rq);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2516b8df6dbb..e8836cfc4cdb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,7 @@
#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
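Assuming the usual RR_TIMESLICE definition of (100 * HZ / 1000) jiffies, the new sysctl mirror is HZ-independent: (MSEC_PER_SEC / HZ) * RR_TIMESLICE = (1000 / HZ) * (100 * HZ / 1000) = 100 ms whether HZ is 100, 250 or 1000.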
@@ -1523,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
}
static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
@@ -1535,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo
* disabled avoiding further scheduler activity on it and we're
* being very careful to re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
pull_rt_task(rq);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a dl or stop task can slip in, in which case we need
@@ -2198,10 +2199,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
#ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
-#else
+#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio)
resched_curr(rq);
-#endif /* CONFIG_SMP */
}
}
@@ -2246,6 +2246,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
}
}
+#ifdef CONFIG_POSIX_TIMERS
static void watchdog(struct rq *rq, struct task_struct *p)
{
unsigned long soft, hard;
@@ -2267,6 +2268,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
}
}
+#else
+static inline void watchdog(struct rq *rq, struct task_struct *p) { }
+#endif
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b34c7826ca5..71b10a9b73cf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4,6 +4,7 @@
#include <linux/sched/rt.h>
#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
+#include <linux/kernel_stat.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
@@ -222,7 +223,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
-extern struct mutex sched_domains_mutex;
+extern void init_dl_bw(struct dl_bw *dl_b);
#ifdef CONFIG_CGROUP_SCHED
@@ -583,6 +584,13 @@ struct root_domain {
};
extern struct root_domain def_root_domain;
+extern struct mutex sched_domains_mutex;
+extern cpumask_var_t fallback_doms;
+extern cpumask_var_t sched_domains_tmpmask;
+
+extern void init_defrootdomain(void);
+extern int init_sched_domains(const struct cpumask *cpu_map);
+extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#endif /* CONFIG_SMP */
@@ -644,7 +652,7 @@ struct rq {
unsigned long next_balance;
struct mm_struct *prev_mm;
- unsigned int clock_skip_update;
+ unsigned int clock_update_flags;
u64 clock;
u64 clock_task;
@@ -768,28 +776,110 @@ static inline u64 __rq_clock_broken(struct rq *rq)
return READ_ONCE(rq->clock);
}
+/*
+ * rq::clock_update_flags bits
+ *
+ * %RQCF_REQ_SKIP - will request skipping of clock update on the next
+ * call to __schedule(). This is an optimisation to avoid
+ * neighbouring rq clock updates.
+ *
+ * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
+ * in effect and calls to update_rq_clock() are being ignored.
+ *
+ * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
+ * made to update_rq_clock() since the last time rq::lock was pinned.
+ *
+ * If inside of __schedule(), clock_update_flags will have been
+ * shifted left (a left shift is a cheap operation for the fast path
+ * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
+ *
+ * if (rq->clock_update_flags >= RQCF_UPDATED)
+ *
+ * to check if %RQCF_UPDATED is set. It'll never be shifted more than
+ * one position though, because the next rq_unpin_lock() will shift it
+ * back.
+ */
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+#define RQCF_UPDATED 0x04
+
+static inline void assert_clock_updated(struct rq *rq)
+{
+ /*
+ * The only reason for not seeing a clock update since the
+ * last rq_pin_lock() is if we're currently skipping updates.
+ */
+ SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
+}
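The left-shift promotion described in the comment block above is easy to check in isolation; a standalone sketch (same flag values, plain C) of why the >= RQCF_UPDATED comparison still works after __schedule() shifts the flags:

#include <assert.h>

#define RQCF_REQ_SKIP	0x01
#define RQCF_ACT_SKIP	0x02
#define RQCF_UPDATED	0x04

int main(void)
{
	unsigned int flags = RQCF_REQ_SKIP | RQCF_UPDATED;

	flags <<= 1;			/* __schedule() promoting REQ to ACT */

	assert(flags & RQCF_ACT_SKIP);	/* 0x01 shifted into 0x02 */
	assert(flags >= RQCF_UPDATED);	/* 0x04 became 0x08, still >= 0x04 */

	flags = RQCF_REQ_SKIP << 1;	/* no update ever happened */
	assert(flags < RQCF_UPDATED);	/* the debug check would fire */
	return 0;
}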
+
static inline u64 rq_clock(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
return rq->clock_task;
}
-#define RQCF_REQ_SKIP 0x01
-#define RQCF_ACT_SKIP 0x02
-
static inline void rq_clock_skip_update(struct rq *rq, bool skip)
{
lockdep_assert_held(&rq->lock);
if (skip)
- rq->clock_skip_update |= RQCF_REQ_SKIP;
+ rq->clock_update_flags |= RQCF_REQ_SKIP;
else
- rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+ rq->clock_update_flags &= ~RQCF_REQ_SKIP;
+}
+
+struct rq_flags {
+ unsigned long flags;
+ struct pin_cookie cookie;
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
+ * current pin context is stashed here in case it needs to be
+ * restored in rq_repin_lock().
+ */
+ unsigned int clock_update_flags;
+#endif
+};
+
+static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ rf->cookie = lockdep_pin_lock(&rq->lock);
+
+#ifdef CONFIG_SCHED_DEBUG
+ rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rf->clock_update_flags = 0;
+#endif
+}
+
+static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ if (rq->clock_update_flags > RQCF_ACT_SKIP)
+ rf->clock_update_flags = RQCF_UPDATED;
+#endif
+
+ lockdep_unpin_lock(&rq->lock, rf->cookie);
+}
+
+static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ lockdep_repin_lock(&rq->lock, rf->cookie);
+
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * Restore the value we stashed in @rf for this pin context.
+ */
+ rq->clock_update_flags |= rf->clock_update_flags;
+#endif
}
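The stash/restore contract between rq_unpin_lock() and rq_repin_lock() can be modelled outside the kernel; a toy sketch (toy_rq/toy_rf are illustrative types, not the kernel structures) of how RQCF_UPDATED survives an unpin/repin round trip:

#include <assert.h>

#define RQCF_ACT_SKIP	0x02
#define RQCF_UPDATED	0x04

struct toy_rq { unsigned int clock_update_flags; };
struct toy_rf { unsigned int clock_update_flags; };

static void toy_unpin(struct toy_rq *rq, struct toy_rf *rf)
{
	if (rq->clock_update_flags > RQCF_ACT_SKIP)	/* i.e. UPDATED is set */
		rf->clock_update_flags = RQCF_UPDATED;	/* stash it */
}

static void toy_repin(struct toy_rq *rq, struct toy_rf *rf)
{
	rq->clock_update_flags |= rf->clock_update_flags;	/* restore it */
}

int main(void)
{
	struct toy_rq rq = { .clock_update_flags = RQCF_UPDATED };
	struct toy_rf rf = { 0 };

	toy_unpin(&rq, &rf);
	rq.clock_update_flags = 0;	/* a nested pin context cleared it */
	toy_repin(&rq, &rf);
	assert(rq.clock_update_flags & RQCF_UPDATED);	/* survived the trip */
	return 0;
}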
#ifdef CONFIG_NUMA
@@ -803,6 +893,16 @@ extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
#endif
+#ifdef CONFIG_NUMA
+extern void sched_init_numa(void);
+extern void sched_domains_numa_masks_set(unsigned int cpu);
+extern void sched_domains_numa_masks_clear(unsigned int cpu);
+#else
+static inline void sched_init_numa(void) { }
+static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
/* The regions in numa_faults array from task_struct */
enum numa_faults_stats {
@@ -969,7 +1069,7 @@ static inline void sched_ttwu_pending(void) { }
#endif /* CONFIG_SMP */
#include "stats.h"
-#include "auto_group.h"
+#include "autogroup.h"
#ifdef CONFIG_CGROUP_SCHED
@@ -1245,7 +1345,7 @@ struct sched_class {
*/
struct task_struct * (*pick_next_task) (struct rq *rq,
struct task_struct *prev,
- struct pin_cookie cookie);
+ struct rq_flags *rf);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
@@ -1501,11 +1601,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
-struct rq_flags {
- unsigned long flags;
- struct pin_cookie cookie;
-};
-
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -1515,7 +1610,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ rq_unpin_lock(rq, rf);
raw_spin_unlock(&rq->lock);
}
@@ -1524,7 +1619,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ rq_unpin_lock(rq, rf);
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1674,6 +1769,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+extern void set_rq_online (struct rq *rq);
+extern void set_rq_offline(struct rq *rq);
+extern bool sched_smp_initialized;
+
#else /* CONFIG_SMP */
/*
@@ -1750,8 +1849,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
- u64 hardirq_time;
- u64 softirq_time;
+ u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync sync;
};
@@ -1761,12 +1859,13 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
static inline u64 irq_time_read(int cpu)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
- total = irqtime->softirq_time + irqtime->hardirq_time;
+ total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 34659a853505..bf0da0aa0a14 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -172,18 +172,19 @@ sched_info_switch(struct rq *rq,
*/
/**
- * cputimer_running - return true if cputimer is running
+ * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
*
* @tsk: Pointer to target task.
*/
-static inline bool cputimer_running(struct task_struct *tsk)
-
+#ifdef CONFIG_POSIX_TIMERS
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
/* Check if cputimer isn't running. This is accessed without locking. */
if (!READ_ONCE(cputimer->running))
- return false;
+ return NULL;
/*
* After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
@@ -200,10 +201,17 @@ static inline bool cputimer_running(struct task_struct *tsk)
* clock delta is behind the expiring timer value.
*/
if (unlikely(!tsk->sighand))
- return false;
+ return NULL;
- return true;
+ return cputimer;
+}
+#else
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
+{
+ return NULL;
}
+#endif
/**
* account_group_user_time - Maintain utime for a thread group.
@@ -216,11 +224,11 @@ static inline bool cputimer_running(struct task_struct *tsk)
* running CPU and update the utime field there.
*/
static inline void account_group_user_time(struct task_struct *tsk,
- cputime_t cputime)
+ u64 cputime)
{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
- if (!cputimer_running(tsk))
+ if (!cputimer)
return;
atomic64_add(cputime, &cputimer->cputime_atomic.utime);
@@ -237,11 +245,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
* running CPU and update the stime field there.
*/
static inline void account_group_system_time(struct task_struct *tsk,
- cputime_t cputime)
+ u64 cputime)
{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
- if (!cputimer_running(tsk))
+ if (!cputimer)
return;
atomic64_add(cputime, &cputimer->cputime_atomic.stime);
@@ -260,9 +268,9 @@ static inline void account_group_system_time(struct task_struct *tsk,
static inline void account_group_exec_runtime(struct task_struct *tsk,
unsigned long long ns)
{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
- if (!cputimer_running(tsk))
+ if (!cputimer)
return;
atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 604297a08b3a..9f69fb630853 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
}
static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *stop = rq->stop;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
new file mode 100644
index 000000000000..1b0b4fb12837
--- /dev/null
+++ b/kernel/sched/topology.c
@@ -0,0 +1,1658 @@
+/*
+ * Scheduler topology setup/handling methods
+ */
+#include <linux/sched.h>
+#include <linux/mutex.h>
+
+#include "sched.h"
+
+DEFINE_MUTEX(sched_domains_mutex);
+
+/* Protected by sched_domains_mutex: */
+cpumask_var_t sched_domains_tmpmask;
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_debug_enabled;
+
+static int __init sched_debug_setup(char *str)
+{
+ sched_debug_enabled = 1;
+
+ return 0;
+}
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+ struct cpumask *groupmask)
+{
+ struct sched_group *group = sd->groups;
+
+ cpumask_clear(groupmask);
+
+ printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not load-balance\n");
+ if (sd->parent)
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+ " has parent");
+ return -1;
+ }
+
+ printk(KERN_CONT "span %*pbl level %s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ printk(KERN_ERR "ERROR: domain->span does not contain "
+ "CPU%d\n", cpu);
+ }
+ if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not contain"
+ " CPU%d\n", cpu);
+ }
+
+ printk(KERN_DEBUG "%*s groups:", level + 1, "");
+ do {
+ if (!group) {
+ printk("\n");
+ printk(KERN_ERR "ERROR: group is NULL\n");
+ break;
+ }
+
+ if (!cpumask_weight(sched_group_cpus(group))) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: empty group\n");
+ break;
+ }
+
+ if (!(sd->flags & SD_OVERLAP) &&
+ cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: repeated CPUs\n");
+ break;
+ }
+
+ cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+
+ printk(KERN_CONT " %*pbl",
+ cpumask_pr_args(sched_group_cpus(group)));
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+ printk(KERN_CONT " (cpu_capacity = %lu)",
+ group->sgc->capacity);
+ }
+
+ group = group->next;
+ } while (group != sd->groups);
+ printk(KERN_CONT "\n");
+
+ if (!cpumask_equal(sched_domain_span(sd), groupmask))
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+ if (sd->parent &&
+ !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+ printk(KERN_ERR "ERROR: parent span is not a superset "
+ "of domain->span\n");
+ return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+ int level = 0;
+
+ if (!sched_debug_enabled)
+ return;
+
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
+ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+ for (;;) {
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+ break;
+ level++;
+ sd = sd->parent;
+ if (!sd)
+ break;
+ }
+}
+#else /* !CONFIG_SCHED_DEBUG */
+
+# define sched_debug_enabled 0
+# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpumask_weight(sched_domain_span(sd)) == 1)
+ return 1;
+
+ /* Following flags need at least 2 groups */
+ if (sd->flags & (SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUCAPACITY |
+ SD_ASYM_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
+ if (sd->groups != sd->groups->next)
+ return 0;
+ }
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_AFFINE))
+ return 0;
+
+ return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+ return 0;
+
+ /* Flags needing groups don't count if only 1 group in parent */
+ if (parent->groups == parent->groups->next) {
+ pflags &= ~(SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_ASYM_CPUCAPACITY |
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
+ if (nr_node_ids == 1)
+ pflags &= ~SD_SERIALIZE;
+ }
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+ cpupri_cleanup(&rd->cpupri);
+ cpudl_cleanup(&rd->cpudl);
+ free_cpumask_var(rd->dlo_mask);
+ free_cpumask_var(rd->rto_mask);
+ free_cpumask_var(rd->online);
+ free_cpumask_var(rd->span);
+ kfree(rd);
+}
+
+void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ struct root_domain *old_rd = NULL;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ if (rq->rd) {
+ old_rd = rq->rd;
+
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+ /*
+ * If we don't want to free the old_rd yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpumask_set_cpu(rq->cpu, rd->span);
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ set_rq_online(rq);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ if (old_rd)
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+ memset(rd, 0, sizeof(*rd));
+
+ if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
+ goto out;
+ if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
+ goto free_span;
+ if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+ goto free_online;
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_rto_mask;
+
+ if (cpupri_init(&rd->cpupri) != 0)
+ goto free_cpudl;
+ return 0;
+
+free_cpudl:
+ cpudl_cleanup(&rd->cpudl);
+free_rto_mask:
+ free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+ free_cpumask_var(rd->dlo_mask);
+free_online:
+ free_cpumask_var(rd->online);
+free_span:
+ free_cpumask_var(rd->span);
+out:
+ return -ENOMEM;
+}
+
+/*
+ * By default the system creates a single root-domain with all CPUs as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
+void init_defrootdomain(void)
+{
+ init_rootdomain(&def_root_domain);
+
+ atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+ struct root_domain *rd;
+
+ rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ if (init_rootdomain(rd) != 0) {
+ kfree(rd);
+ return NULL;
+ }
+
+ return rd;
+}
+
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
+
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd)
+{
+ /*
+ * If it's an overlapping domain it has private groups; iterate and
+ * nuke them all.
+ */
+ if (sd->flags & SD_OVERLAP) {
+ free_sched_groups(sd->groups, 1);
+ } else if (atomic_dec_and_test(&sd->groups->ref)) {
+ kfree(sd->groups->sgc);
+ kfree(sd->groups);
+ }
+ if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+ kfree(sd->shared);
+ kfree(sd);
+}
+
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+ while (sd) {
+ struct sched_domain *parent = sd->parent;
+ destroy_sched_domain(sd);
+ sd = parent;
+ }
+}
+
+static void destroy_sched_domains(struct sched_domain *sd)
+{
+ if (sd)
+ call_rcu(&sd->rcu, destroy_sched_domains_rcu);
+}
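The ordering inside destroy_sched_domains_rcu() is the subtle part: destroy_sched_domain() frees sd itself, so each iteration loads sd->parent into a local before the free. One call_rcu() on the base domain then tears down the whole chain after a grace period, which is what lets lockless readers walk rq->sd and sd->parent under rcu_read_lock() without racing the free.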
+
+/*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
+ * allows us to avoid some pointer chasing in select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first CPU number in
+ * the cpumask of the domain), this allows us to quickly tell if
+ * two CPUs are in the same cache domain, see cpus_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
+DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+
+static void update_top_cache_domain(int cpu)
+{
+ struct sched_domain_shared *sds = NULL;
+ struct sched_domain *sd;
+ int id = cpu;
+ int size = 1;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ if (sd) {
+ id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ sds = sd->shared;
+ }
+
+ rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
+ per_cpu(sd_llc_id, cpu) = id;
+ rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+
+ sd = lowest_flag_domain(cpu, SD_NUMA);
+ rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+
+ if (sd_parent_degenerate(tmp, parent)) {
+ tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
+ destroy_sched_domain(parent);
+ } else
+ tmp = tmp->parent;
+ }
+
+ if (sd && sd_degenerate(sd)) {
+ tmp = sd;
+ sd = sd->parent;
+ destroy_sched_domain(tmp);
+ if (sd)
+ sd->child = NULL;
+ }
+
+ sched_domain_debug(sd, cpu);
+
+ rq_attach_root(rq, rd);
+ tmp = rq->sd;
+ rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp);
+
+ update_top_cache_domain(cpu);
+}
+
+/* Setup the mask of CPUs configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+ int ret;
+
+ alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ ret = cpulist_parse(str, cpu_isolated_map);
+ if (ret) {
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ return 0;
+ }
+ return 1;
+}
+__setup("isolcpus=", isolated_cpu_setup);
+
+struct s_data {
+ struct sched_domain ** __percpu sd;
+ struct root_domain *rd;
+};
+
+enum s_alloc {
+ sa_rootdomain,
+ sa_sd,
+ sa_sd_storage,
+ sa_none,
+};
+
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth; make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * CPU they're built on, so check that.
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
+
+/*
+ * Return the canonical balance CPU for this group; this is the first CPU
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct cpumask *sg_span;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sibling = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ goto fail;
+
+ sg_span = sched_group_cpus(sg);
+ if (sibling->child)
+ cpumask_copy(sg_span, sched_domain_span(sibling->child));
+ else
+ cpumask_set_cpu(i, sg_span);
+
+ cpumask_or(covered, covered, sg_span);
+
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ build_group_mask(sd, sg);
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance CPU. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ group_balance_cpu(sg) == cpu)
+ groups = sg;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = groups;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+{
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
+
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
+
+ if (sg) {
+ *sg = *per_cpu_ptr(sdd->sg, cpu);
+ (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+ /* For claim_allocations: */
+ atomic_set(&(*sg)->sgc->ref, 1);
+ }
+
+ return cpu;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, set each group's ->cpumask correctly, and
+ * set each group's ->cpu_capacity to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
+
+ get_group(cpu, sdd, &sd->groups);
+ atomic_inc(&sd->groups->ref);
+
+ if (cpu != cpumask_first(span))
+ return 0;
+
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct sched_group *sg;
+ int group, j;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ group = get_group(i, sdd, &sg);
+ cpumask_setall(sched_group_mask(sg));
+
+ for_each_cpu(j, span) {
+ if (get_group(j, sdd, NULL) != group)
+ continue;
+
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
+ }
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ }
+ last->next = first;
+
+ return 0;
+}
+
+/*
+ * Initialize sched groups cpu_capacity.
+ *
+ * cpu_capacity indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_capacity for all the groups in a sched domain will be the
+ * same unless there are asymmetries in the topology. If there are
+ * asymmetries, a group with more cpu_capacity will pick up more load than
+ * a group with less cpu_capacity.
+ */
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+{
+ struct sched_group *sg = sd->groups;
+
+ WARN_ON(!sg);
+
+ do {
+ int cpu, max_cpu = -1;
+
+ sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ goto next;
+
+ for_each_cpu(cpu, sched_group_cpus(sg)) {
+ if (max_cpu < 0)
+ max_cpu = cpu;
+ else if (sched_asym_prefer(cpu, max_cpu))
+ max_cpu = cpu;
+ }
+ sg->asym_prefer_cpu = max_cpu;
+
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
+
+ if (cpu != group_balance_cpu(sg))
+ return;
+
+ update_group_capacity(sd, cpu);
+}
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
+
+ return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+ struct sched_domain_attr *attr)
+{
+ int request;
+
+ if (!attr || attr->relax_domain_level < 0) {
+ if (default_relax_domain_level < 0)
+ return;
+ else
+ request = default_relax_domain_level;
+ } else
+ request = attr->relax_domain_level;
+ if (request < sd->level) {
+ /* Turn off idle balance on this domain: */
+ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ } else {
+ /* Turn on idle balance on this domain: */
+ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ }
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_rootdomain:
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu);
+ /* Fall through */
+ case sa_sd:
+ free_percpu(d->sd);
+ /* Fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map);
+ /* Fall through */
+ case sa_none:
+ break;
+ }
+}
+
+static enum s_alloc
+__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
+{
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
+ d->rd = alloc_rootdomain();
+ if (!d->rd)
+ return sa_sd;
+ return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+ struct sd_data *sdd = sd->private;
+
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+ *per_cpu_ptr(sdd->sds, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
+}
+
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
+static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ *
+ * Odd one out, which besides describing the topology also
+ * prescribes the desired behaviour that goes along with it:
+ *
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
+ SD_SHARE_POWERDOMAIN)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map,
+ struct sched_domain *child, int cpu)
+{
+ struct sd_data *sdd = &tl->data;
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ int sd_id, sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | 0*SD_NUMA
+ | sd_flags
+ ,
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+ .child = child,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
+ };
+
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ sd_id = cpumask_first(sched_domain_span(sd));
+
+ /*
+ * Convert topological properties into behaviour.
+ */
+
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ struct sched_domain *t = sd;
+
+ for_each_lower_domain(t)
+ t->flags |= SD_BALANCE_WAKE;
+ }
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ /*
+ * For all levels sharing cache; connect a sched_domain_shared
+ * instance.
+ */
+ if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+ }
+
+ sd->private = sdd;
+
+ return sd;
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology =
+ default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ if (WARN_ON_ONCE(sched_smp_initialized))
+ return;
+
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static bool done = false;
+ int i, j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (sched_domains_numa_levels <= 1) {
+ sched_numa_topology_type = NUMA_DIRECT;
+ return;
+ }
+
+ for_each_online_node(a) {
+ for_each_online_node(b) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_online_node(c) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+}
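
To make the classification above concrete, three illustrative node_distance() tables (example numbers, not taken from any real machine):

	direct (2 nodes)   glueless mesh (ring of 4)   backplane (2 blades of 2)
	  10 20              10 20 30 20                 10 20 40 40
	  20 10              20 10 20 30                 20 10 40 40
	                     30 20 10 20                 40 40 10 20
	                     20 30 20 10                 40 40 20 10

The first table has a single remote distance, so sched_domains_numa_levels stays at 1 and the type is NUMA_DIRECT. In the ring, nodes 0 and 2 sit at the maximum distance (30) but node 1 is closer than 30 to both, so the scan above reports NUMA_GLUELESS_MESH. In the last table the far pairs (e.g. nodes 0 and 2) have no node closer than 40 to both ends, which yields NUMA_BACKPLANE.
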
+
+void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+
+ if (!level)
+ return;
+
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+	 * the array will contain fewer than 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * CPUs of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for_each_node(k) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ SD_INIT_NAME(NUMA)
+ };
+ }
+
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+ sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+ init_numa_topology_type();
+}
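
A worked sketch of the function above (example distances, with SMT and MC configured in): on a four-node machine whose node_distance() table contains only the values 10, 20 and 30, the deduplication pass records sched_domains_numa_distance[] = { 20, 30 } and level = 2. sched_domains_numa_masks[0][j] then covers every node within distance 20 of node j, sched_domains_numa_masks[1][j] covers all nodes, and the final topology array is SMT, MC, DIE plus two NUMA levels (numa_level 0 and 1), with sched_max_numa_distance = 30.
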
+
+void sched_domains_numa_masks_set(unsigned int cpu)
+{
+ int node = cpu_to_node(cpu);
+ int i, j;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+void sched_domains_numa_masks_clear(unsigned int cpu)
+{
+ int i, j;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+#endif /* CONFIG_NUMA */
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sds = alloc_percpu(struct sched_domain_shared *);
+ if (!sdd->sds)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_domain_shared *sds;
+ struct sched_group *sg;
+ struct sched_group_capacity *sgc;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sds)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sds, j) = sds;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ sg->next = sg;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgc)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sds)
+ kfree(*per_cpu_ptr(sdd->sds, j));
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
+ }
+ free_percpu(sdd->sd);
+ sdd->sd = NULL;
+ free_percpu(sdd->sds);
+ sdd->sds = NULL;
+ free_percpu(sdd->sg);
+ sdd->sg = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
+ }
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
+{
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
+
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child cpus. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
+ }
+ set_domain_attribute(sd, attr);
+
+ return sd;
+}
+
+/*
+ * Build sched domains for a given set of CPUs and attach the sched domains
+ * to the individual CPUs
+ */
+static int
+build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state;
+ struct sched_domain *sd;
+ struct s_data d;
+ struct rq *rq = NULL;
+ int i, ret = -ENOMEM;
+
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+
+ /* Set up domains for CPUs specified by the cpu_map: */
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
+ }
+
+ /* Build the groups for the domains */
+ for_each_cpu(i, cpu_map) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
+ }
+ }
+
+ /* Calculate CPU capacity for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_capacity(i, sd);
+ }
+ }
+
+ /* Attach the domains */
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
+ sd = *per_cpu_ptr(d.sd, i);
+
+ /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+ if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+ WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
+ cpu_attach_domain(sd, d.rd, i);
+ }
+ rcu_read_unlock();
+
+ if (rq && sched_debug_enabled) {
+ pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+ cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+ }
+
+ ret = 0;
+error:
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+ return ret;
+}
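
For orientation, on a hypothetical two-socket machine with two cores per socket and SMT-2 (CPU numbering assumed), the per-CPU walk above leaves CPU 0 with the parent chain

	SMT {0,4} -> MC {0,1,4,5} -> DIE {0-7}

The for_each_sd_topology() walk stops at DIE because its span already equals cpu_map; the later loops then build the groups and capacities for each level before the domains are attached.
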
+
+/* Current sched domains: */
+static cpumask_var_t *doms_cur;
+
+/* Number of sched domains in 'doms_cur': */
+static int ndoms_cur;
+
+/* Attributes of custom domains in 'doms_cur': */
+static struct sched_domain_attr *dattr_cur;
+
+/*
+ * Special case: If a kmalloc() of a doms_cur partition (array of
+ * cpumask) fails, then fall back to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * CPU core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __weak arch_update_cpu_topology(void)
+{
+ return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+ int i;
+ cpumask_var_t *doms;
+
+ doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+ if (!doms)
+ return NULL;
+ for (i = 0; i < ndoms; i++) {
+ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+ free_sched_domains(doms, i);
+ return NULL;
+ }
+ }
+ return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+ unsigned int i;
+ for (i = 0; i < ndoms; i++)
+ free_cpumask_var(doms[i]);
+ kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated CPUs, but could be used to
+ * exclude other special cases in the future.
+ */
+int init_sched_domains(const struct cpumask *cpu_map)
+{
+ int err;
+
+ arch_update_cpu_topology();
+ ndoms_cur = 1;
+ doms_cur = alloc_sched_domains(ndoms_cur);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+ err = build_sched_domains(doms_cur[0], NULL);
+ register_sched_domain_sysctl();
+
+ return err;
+}
+
+/*
+ * Detach sched domains from a group of CPUs specified in cpu_map.
+ * These CPUs will now be attached to the NULL domain.
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+ int i;
+
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map)
+ cpu_attach_domain(NULL, &def_root_domain, i);
+ rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+ struct sched_domain_attr *new, int idx_new)
+{
+ struct sched_domain_attr tmp;
+
+ /* Fast path: */
+ if (!new && !cur)
+ return 1;
+
+ tmp = SD_ATTR_INIT;
+ return !memcmp(cur ? (cur + idx_cur) : &tmp,
+ new ? (new + idx_new) : &tmp,
+ sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap). We should set up one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed-in 'doms_new' should be allocated using
+ * alloc_sched_domains. This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, it can pass in doms_new == NULL && ndoms_new == 1;
+ * partition_sched_domains() will then fall back to the single
+ * partition 'fallback_doms' and force the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ int i, j, n;
+ int new_topology;
+
+ mutex_lock(&sched_domains_mutex);
+
+ /* Always unregister in case we don't destroy any domains: */
+ unregister_sched_domain_sysctl();
+
+ /* Let the architecture update CPU core mappings: */
+ new_topology = arch_update_cpu_topology();
+
+ n = doms_new ? ndoms_new : 0;
+
+ /* Destroy deleted domains: */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_cur[i], doms_new[j])
+ && dattrs_equal(dattr_cur, i, dattr_new, j))
+ goto match1;
+ }
+ /* No match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur[i]);
+match1:
+ ;
+ }
+
+ n = ndoms_cur;
+ if (doms_new == NULL) {
+ n = 0;
+ doms_new = &fallback_doms;
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ WARN_ON_ONCE(dattr_new);
+ }
+
+ /* Build new domains: */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j])
+ && dattrs_equal(dattr_new, i, dattr_cur, j))
+ goto match2;
+ }
+ /* No match - add a new doms_new */
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+ ;
+ }
+
+ /* Remember the new sched domains: */
+ if (doms_cur != &fallback_doms)
+ free_sched_domains(doms_cur, ndoms_cur);
+
+ kfree(dattr_cur);
+ doms_cur = doms_new;
+ dattr_cur = dattr_new;
+ ndoms_cur = ndoms_new;
+
+ register_sched_domain_sysctl();
+
+ mutex_unlock(&sched_domains_mutex);
+}
+
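
A hedged caller-side sketch of the contract described above (hypothetical CPU split; the real callers live in the cpuset code, and the hotplug lock must be held as noted):

	static void example_repartition(void)
	{
		cpumask_var_t *doms = alloc_sched_domains(2);

		if (!doms) {
			/* Failed alloc: NULL + ndoms_new == 1 rebuilds fallback_doms. */
			partition_sched_domains(1, NULL, NULL);
			return;
		}

		/* Hypothetical, non-overlapping halves of the machine: */
		cpumask_copy(doms[0], cpumask_of(0));
		cpumask_copy(doms[1], cpumask_of(1));

		/* Ownership of 'doms' passes to the scheduler here. */
		partition_sched_domains(2, doms, NULL);
	}
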
diff --git a/kernel/signal.c b/kernel/signal.c
index 3603d93a1968..13f9def8b24a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1581,7 +1581,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
unsigned long flags;
struct sighand_struct *psig;
bool autoreap = false;
- cputime_t utime, stime;
+ u64 utime, stime;
BUG_ON(sig == -1);
@@ -1620,8 +1620,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
- info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
- info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
+ info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime);
+ info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime);
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
@@ -1685,7 +1685,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
- cputime_t utime, stime;
+ u64 utime, stime;
if (for_ptracer) {
parent = tsk->parent;
@@ -1705,8 +1705,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
- info.si_utime = cputime_to_clock_t(utime);
- info.si_stime = cputime_to_clock_t(stime);
+ info.si_utime = nsec_to_clock_t(utime);
+ info.si_stime = nsec_to_clock_t(stime);
info.si_code = why;
switch (why) {
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index b6e4c16377c7..9c15a9124e83 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -18,10 +18,8 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
if (WARN_ON(!trace->entries))
return;
- for (i = 0; i < trace->nr_entries; i++) {
- printk("%*c", 1 + spaces, ' ');
- print_ip_sym(trace->entries[i]);
- }
+ for (i = 0; i < trace->nr_entries; i++)
+ printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]);
}
EXPORT_SYMBOL_GPL(print_stack_trace);
@@ -29,7 +27,6 @@ int snprint_stack_trace(char *buf, size_t size,
struct stack_trace *trace, int spaces)
{
int i;
- unsigned long ip;
int generated;
int total = 0;
@@ -37,9 +34,8 @@ int snprint_stack_trace(char *buf, size_t size,
return 0;
for (i = 0; i < trace->nr_entries; i++) {
- ip = trace->entries[i];
- generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
- 1 + spaces, ' ', (void *) ip, (void *) ip);
+ generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ',
+ (void *)trace->entries[i]);
total += generated;
diff --git a/kernel/sys.c b/kernel/sys.c
index 842914ef7de4..7d4a9a6df956 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -881,15 +881,15 @@ SYSCALL_DEFINE0(getegid)
void do_sys_times(struct tms *tms)
{
- cputime_t tgutime, tgstime, cutime, cstime;
+ u64 tgutime, tgstime, cutime, cstime;
thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
- tms->tms_utime = cputime_to_clock_t(tgutime);
- tms->tms_stime = cputime_to_clock_t(tgstime);
- tms->tms_cutime = cputime_to_clock_t(cutime);
- tms->tms_cstime = cputime_to_clock_t(cstime);
+ tms->tms_utime = nsec_to_clock_t(tgutime);
+ tms->tms_stime = nsec_to_clock_t(tgstime);
+ tms->tms_cutime = nsec_to_clock_t(cutime);
+ tms->tms_cstime = nsec_to_clock_t(cstime);
}
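
For reference (worked numbers, assuming USER_HZ=100): nsec_to_clock_t() converts nanoseconds straight to USER_HZ ticks, so a thread-group total of 1,500,000,000 ns is reported as 150 in tms_utime, without the intermediate cputime_t granularity of the old cputime_to_clock_t() path.
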
SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
@@ -1544,7 +1544,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
{
struct task_struct *t;
unsigned long flags;
- cputime_t tgutime, tgstime, utime, stime;
+ u64 tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
memset((char *)r, 0, sizeof (*r));
@@ -1600,8 +1600,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
unlock_task_sighand(p, &flags);
out:
- cputime_to_timeval(utime, &r->ru_utime);
- cputime_to_timeval(stime, &r->ru_stime);
+ r->ru_utime = ns_to_timeval(utime);
+ r->ru_stime = ns_to_timeval(stime);
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8dbaec0e4f7f..bb260ceb3718 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -416,7 +416,7 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "sched_rr_timeslice_ms",
- .data = &sched_rr_timeslice,
+ .data = &sysctl_sched_rr_timeslice,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rr_handler,
@@ -2475,6 +2475,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
break;
if (neg)
continue;
+ val = convmul * val / convdiv;
if ((min && val < *min) || (max && val > *max))
continue;
*i = val;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 976840d29a71..938dbf33ef49 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -15,6 +15,5 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
-obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 665985b0a89a..93621ae718d3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs)
{
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
+
+ if (cs->mark_unstable)
+ cs->mark_unstable(cs);
+
if (finished_booting)
schedule_work(&watchdog_work);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index c6ecedd3b839..8e11d8d9f419 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -94,17 +94,15 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ /* Make sure we catch unsupported clockids */
+ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
+
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
-static inline int hrtimer_clockid_to_base(clockid_t clock_id)
-{
- return hrtimer_clock_to_base_table[clock_id];
-}
-
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -766,34 +764,6 @@ void hrtimers_resume(void)
clock_was_set_delayed();
}
-static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- if (timer->start_site)
- return;
- timer->start_site = __builtin_return_address(0);
- memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
- timer->start_pid = current->pid;
-#endif
-}
-
-static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
-#endif
-}
-
-static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- if (likely(!timer_stats_active))
- return;
- timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
- timer->function, timer->start_comm, 0);
-#endif
-}
-
/*
* Counterpart to lock_hrtimer_base above:
*/
@@ -932,7 +902,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
* rare case and less expensive than a smp call.
*/
debug_deactivate(timer);
- timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
if (!restart)
@@ -990,8 +959,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Switch the timer base, if necessary: */
new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- timer_stats_hrtimer_set_start_info(timer);
-
leftmost = enqueue_hrtimer(timer, new_base);
if (!leftmost)
goto unlock;
@@ -1112,6 +1079,18 @@ u64 hrtimer_get_next_event(void)
}
#endif
+static inline int hrtimer_clockid_to_base(clockid_t clock_id)
+{
+ if (likely(clock_id < MAX_CLOCKS)) {
+ int base = hrtimer_clock_to_base_table[clock_id];
+
+ if (likely(base != HRTIMER_MAX_CLOCK_BASES))
+ return base;
+ }
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return HRTIMER_BASE_MONOTONIC;
+}
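
The lookup table and fallback above rely on GCC's range designated initializer; a minimal standalone illustration of the idiom (the BASE_* names and the indices 2 and 5 are invented):

	enum { BASE_A, BASE_B, BASE_MAX };

	static const int clock_to_base[8] = {
		[0 ... 7] = BASE_MAX,	/* default: mark every id unsupported */
		[2]       = BASE_A,	/* then override the valid ids */
		[5]       = BASE_B,
	};

A lookup can now tell an unsupported id (BASE_MAX) from a valid one and warn instead of indexing past the initialized entries.
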
+
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
@@ -1128,12 +1107,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
base = hrtimer_clockid_to_base(clock_id);
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
-
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
- timer->start_pid = -1;
- memset(timer->start_comm, 0, TASK_COMM_LEN);
-#endif
}
/**
@@ -1217,7 +1190,6 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
raw_write_seqcount_barrier(&cpu_base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
- timer_stats_account_hrtimer(timer);
fn = timer->function;
/*
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 8c89143f9ebf..a95f13c31464 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -45,16 +45,16 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
struct itimerval *const value)
{
- cputime_t cval, cinterval;
+ u64 val, interval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
spin_lock_irq(&tsk->sighand->siglock);
- cval = it->expires;
- cinterval = it->incr;
- if (cval) {
+ val = it->expires;
+ interval = it->incr;
+ if (val) {
struct task_cputime cputime;
- cputime_t t;
+ u64 t;
thread_group_cputimer(tsk, &cputime);
if (clock_id == CPUCLOCK_PROF)
@@ -63,17 +63,17 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
/* CPUCLOCK_VIRT */
t = cputime.utime;
- if (cval < t)
+ if (val < t)
/* about to fire */
- cval = cputime_one_jiffy;
+ val = TICK_NSEC;
else
- cval = cval - t;
+ val -= t;
}
spin_unlock_irq(&tsk->sighand->siglock);
- cputime_to_timeval(cval, &value->it_value);
- cputime_to_timeval(cinterval, &value->it_interval);
+ value->it_value = ns_to_timeval(val);
+ value->it_interval = ns_to_timeval(interval);
}
int do_getitimer(int which, struct itimerval *value)
@@ -129,55 +129,35 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
-{
- struct timespec ts;
- s64 cpu_ns;
-
- cputime_to_timespec(ct, &ts);
- cpu_ns = timespec_to_ns(&ts);
-
- return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
-}
-
static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
const struct itimerval *const value,
struct itimerval *const ovalue)
{
- cputime_t cval, nval, cinterval, ninterval;
- s64 ns_ninterval, ns_nval;
- u32 error, incr_error;
+ u64 oval, nval, ointerval, ninterval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
- nval = timeval_to_cputime(&value->it_value);
- ns_nval = timeval_to_ns(&value->it_value);
- ninterval = timeval_to_cputime(&value->it_interval);
- ns_ninterval = timeval_to_ns(&value->it_interval);
-
- error = cputime_sub_ns(nval, ns_nval);
- incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+ nval = timeval_to_ns(&value->it_value);
+ ninterval = timeval_to_ns(&value->it_interval);
spin_lock_irq(&tsk->sighand->siglock);
- cval = it->expires;
- cinterval = it->incr;
- if (cval || nval) {
+ oval = it->expires;
+ ointerval = it->incr;
+ if (oval || nval) {
if (nval > 0)
- nval += cputime_one_jiffy;
- set_process_cpu_timer(tsk, clock_id, &nval, &cval);
+ nval += TICK_NSEC;
+ set_process_cpu_timer(tsk, clock_id, &nval, &oval);
}
it->expires = nval;
it->incr = ninterval;
- it->error = error;
- it->incr_error = incr_error;
trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
- cputime_to_timeval(cval, &ovalue->it_value);
- cputime_to_timeval(cinterval, &ovalue->it_interval);
+ ovalue->it_value = ns_to_timeval(oval);
+ ovalue->it_interval = ns_to_timeval(ointerval);
}
}
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a4a0e478e44d..7906b3f0c41a 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -27,19 +27,8 @@
#include "timekeeping.h"
-/* The Jiffies based clocksource is the lowest common
- * denominator clock source which should function on
- * all systems. It has the same coarse resolution as
- * the timer interrupt frequency HZ and it suffers
- * inaccuracies caused by missed or lost timer
- * interrupts and the inability for the timer
- * interrupt hardware to accuratly tick at the
- * requested HZ value. It is also not recommended
- * for "tick-less" systems.
- */
-#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
-/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
+/* Since jiffies uses a simple TICK_NSEC multiplier
* conversion, the .shift value could be zero. However
* this would make NTP adjustments impossible as they are
* in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
@@ -47,8 +36,8 @@
* amount, and give ntp adjustments in units of 1/2^8
*
* The value 8 is somewhat carefully chosen, as anything
- * larger can result in overflows. NSEC_PER_JIFFY grows as
- * HZ shrinks, so values greater than 8 overflow 32bits when
+ * larger can result in overflows. TICK_NSEC grows as HZ
+ * shrinks, so values greater than 8 overflow 32bits when
* HZ=100.
*/
#if HZ < 34
@@ -64,12 +53,23 @@ static u64 jiffies_read(struct clocksource *cs)
return (u64) jiffies;
}
+/*
+ * The Jiffies based clocksource is the lowest common
+ * denominator clock source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accurately tick at the
+ * requested HZ value. It is also not recommended
+ * for "tick-less" systems.
+ */
static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
.read = jiffies_read,
.mask = CLOCKSOURCE_MASK(32),
- .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+ .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
.max_cycles = 10,
};
@@ -125,7 +125,7 @@ int register_refined_jiffies(long cycles_per_second)
shift_hz += cycles_per_tick/2;
do_div(shift_hz, cycles_per_tick);
/* Calculate nsec_per_tick using shift_hz */
- nsec_per_tick = (u64)NSEC_PER_SEC << 8;
+ nsec_per_tick = (u64)TICK_NSEC << 8;
nsec_per_tick += (u32)shift_hz/2;
do_div(nsec_per_tick, (u32)shift_hz);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e9e8c10f0d9a..b4377a5e4269 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -20,10 +20,10 @@
*/
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
- cputime_t cputime = secs_to_cputime(rlim_new);
+ u64 nsecs = rlim_new * NSEC_PER_SEC;
spin_lock_irq(&task->sighand->siglock);
- set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
+ set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
spin_unlock_irq(&task->sighand->siglock);
}
@@ -50,39 +50,14 @@ static int check_clock(const clockid_t which_clock)
return error;
}
-static inline unsigned long long
-timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
-{
- unsigned long long ret;
-
- ret = 0; /* high half always zero when .cpu used */
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
- } else {
- ret = cputime_to_expires(timespec_to_cputime(tp));
- }
- return ret;
-}
-
-static void sample_to_timespec(const clockid_t which_clock,
- unsigned long long expires,
- struct timespec *tp)
-{
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
- *tp = ns_to_timespec(expires);
- else
- cputime_to_timespec((__force cputime_t)expires, tp);
-}
-
/*
* Update expiry time from increment, and increase overrun count,
* given the current clock sample.
*/
-static void bump_cpu_timer(struct k_itimer *timer,
- unsigned long long now)
+static void bump_cpu_timer(struct k_itimer *timer, u64 now)
{
int i;
- unsigned long long delta, incr;
+ u64 delta, incr;
if (timer->it.cpu.incr == 0)
return;
@@ -122,21 +97,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
return 0;
}
-static inline unsigned long long prof_ticks(struct task_struct *p)
+static inline u64 prof_ticks(struct task_struct *p)
{
- cputime_t utime, stime;
+ u64 utime, stime;
task_cputime(p, &utime, &stime);
- return cputime_to_expires(utime + stime);
+ return utime + stime;
}
-static inline unsigned long long virt_ticks(struct task_struct *p)
+static inline u64 virt_ticks(struct task_struct *p)
{
- cputime_t utime, stime;
+ u64 utime, stime;
task_cputime(p, &utime, &stime);
- return cputime_to_expires(utime);
+ return utime;
}
static int
@@ -176,8 +151,8 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
/*
* Sample a per-thread clock for the given task.
*/
-static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
- unsigned long long *sample)
+static int cpu_clock_sample(const clockid_t which_clock,
+ struct task_struct *p, u64 *sample)
{
switch (CPUCLOCK_WHICH(which_clock)) {
default:
@@ -260,7 +235,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
*/
static int cpu_clock_sample_group(const clockid_t which_clock,
struct task_struct *p,
- unsigned long long *sample)
+ u64 *sample)
{
struct task_cputime cputime;
@@ -269,11 +244,11 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
return -EINVAL;
case CPUCLOCK_PROF:
thread_group_cputime(p, &cputime);
- *sample = cputime_to_expires(cputime.utime + cputime.stime);
+ *sample = cputime.utime + cputime.stime;
break;
case CPUCLOCK_VIRT:
thread_group_cputime(p, &cputime);
- *sample = cputime_to_expires(cputime.utime);
+ *sample = cputime.utime;
break;
case CPUCLOCK_SCHED:
thread_group_cputime(p, &cputime);
@@ -288,7 +263,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
struct timespec *tp)
{
int err = -EINVAL;
- unsigned long long rtn;
+ u64 rtn;
if (CPUCLOCK_PERTHREAD(which_clock)) {
if (same_thread_group(tsk, current))
@@ -299,7 +274,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
}
if (!err)
- sample_to_timespec(which_clock, rtn, tp);
+ *tp = ns_to_timespec(rtn);
return err;
}
@@ -453,7 +428,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
cleanup_timers(tsk->signal->cpu_timers);
}
-static inline int expires_gt(cputime_t expires, cputime_t new_exp)
+static inline int expires_gt(u64 expires, u64 new_exp)
{
return expires == 0 || expires > new_exp;
}
@@ -488,7 +463,7 @@ static void arm_timer(struct k_itimer *timer)
list_add(&nt->entry, listpos);
if (listpos == head) {
- unsigned long long exp = nt->expires;
+ u64 exp = nt->expires;
/*
* We are the new earliest-expiring POSIX 1.b timer, hence
@@ -499,16 +474,15 @@ static void arm_timer(struct k_itimer *timer)
switch (CPUCLOCK_WHICH(timer->it_clock)) {
case CPUCLOCK_PROF:
- if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
- cputime_expires->prof_exp = expires_to_cputime(exp);
+ if (expires_gt(cputime_expires->prof_exp, exp))
+ cputime_expires->prof_exp = exp;
break;
case CPUCLOCK_VIRT:
- if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
- cputime_expires->virt_exp = expires_to_cputime(exp);
+ if (expires_gt(cputime_expires->virt_exp, exp))
+ cputime_expires->virt_exp = exp;
break;
case CPUCLOCK_SCHED:
- if (cputime_expires->sched_exp == 0 ||
- cputime_expires->sched_exp > exp)
+ if (expires_gt(cputime_expires->sched_exp, exp))
cputime_expires->sched_exp = exp;
break;
}
@@ -559,8 +533,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
* traversal.
*/
static int cpu_timer_sample_group(const clockid_t which_clock,
- struct task_struct *p,
- unsigned long long *sample)
+ struct task_struct *p, u64 *sample)
{
struct task_cputime cputime;
@@ -569,10 +542,10 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
default:
return -EINVAL;
case CPUCLOCK_PROF:
- *sample = cputime_to_expires(cputime.utime + cputime.stime);
+ *sample = cputime.utime + cputime.stime;
break;
case CPUCLOCK_VIRT:
- *sample = cputime_to_expires(cputime.utime);
+ *sample = cputime.utime;
break;
case CPUCLOCK_SCHED:
*sample = cputime.sum_exec_runtime;
@@ -593,12 +566,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
unsigned long flags;
struct sighand_struct *sighand;
struct task_struct *p = timer->it.cpu.task;
- unsigned long long old_expires, new_expires, old_incr, val;
+ u64 old_expires, new_expires, old_incr, val;
int ret;
WARN_ON_ONCE(p == NULL);
- new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
+ new_expires = timespec_to_ns(&new->it_value);
/*
* Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -659,9 +632,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
bump_cpu_timer(timer, val);
if (val < timer->it.cpu.expires) {
old_expires = timer->it.cpu.expires - val;
- sample_to_timespec(timer->it_clock,
- old_expires,
- &old->it_value);
+ old->it_value = ns_to_timespec(old_expires);
} else {
old->it_value.tv_nsec = 1;
old->it_value.tv_sec = 0;
@@ -699,8 +670,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
*/
- timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
- &new->it_interval);
+ timer->it.cpu.incr = timespec_to_ns(&new->it_interval);
/*
* This acts as a modification timestamp for the timer,
@@ -723,17 +693,15 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
ret = 0;
out:
- if (old) {
- sample_to_timespec(timer->it_clock,
- old_incr, &old->it_interval);
- }
+ if (old)
+ old->it_interval = ns_to_timespec(old_incr);
return ret;
}
static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
{
- unsigned long long now;
+ u64 now;
struct task_struct *p = timer->it.cpu.task;
WARN_ON_ONCE(p == NULL);
@@ -741,8 +709,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
/*
* Easy part: convert the reload time.
*/
- sample_to_timespec(timer->it_clock,
- timer->it.cpu.incr, &itp->it_interval);
+ itp->it_interval = ns_to_timespec(timer->it.cpu.incr);
if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -771,8 +738,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
* Call the timer disarmed, nothing else to do.
*/
timer->it.cpu.expires = 0;
- sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
- &itp->it_value);
+ itp->it_value = ns_to_timespec(timer->it.cpu.expires);
return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -781,9 +747,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
}
if (now < timer->it.cpu.expires) {
- sample_to_timespec(timer->it_clock,
- timer->it.cpu.expires - now,
- &itp->it_value);
+ itp->it_value = ns_to_timespec(timer->it.cpu.expires - now);
} else {
/*
* The timer should have expired already, but the firing
@@ -827,7 +791,7 @@ static void check_thread_timers(struct task_struct *tsk,
struct list_head *timers = tsk->cpu_timers;
struct signal_struct *const sig = tsk->signal;
struct task_cputime *tsk_expires = &tsk->cputime_expires;
- unsigned long long expires;
+ u64 expires;
unsigned long soft;
/*
@@ -838,10 +802,10 @@ static void check_thread_timers(struct task_struct *tsk,
return;
expires = check_timers_list(timers, firing, prof_ticks(tsk));
- tsk_expires->prof_exp = expires_to_cputime(expires);
+ tsk_expires->prof_exp = expires;
expires = check_timers_list(++timers, firing, virt_ticks(tsk));
- tsk_expires->virt_exp = expires_to_cputime(expires);
+ tsk_expires->virt_exp = expires;
tsk_expires->sched_exp = check_timers_list(++timers, firing,
tsk->se.sum_exec_runtime);
@@ -890,26 +854,17 @@ static inline void stop_process_timers(struct signal_struct *sig)
tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
}
-static u32 onecputick;
-
static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
- unsigned long long *expires,
- unsigned long long cur_time, int signo)
+ u64 *expires, u64 cur_time, int signo)
{
if (!it->expires)
return;
if (cur_time >= it->expires) {
- if (it->incr) {
+ if (it->incr)
it->expires += it->incr;
- it->error += it->incr_error;
- if (it->error >= onecputick) {
- it->expires -= cputime_one_jiffy;
- it->error -= onecputick;
- }
- } else {
+ else
it->expires = 0;
- }
trace_itimer_expire(signo == SIGPROF ?
ITIMER_PROF : ITIMER_VIRTUAL,
@@ -917,9 +872,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
}
- if (it->expires && (!*expires || it->expires < *expires)) {
+ if (it->expires && (!*expires || it->expires < *expires))
*expires = it->expires;
- }
}
/*
@@ -931,8 +885,8 @@ static void check_process_timers(struct task_struct *tsk,
struct list_head *firing)
{
struct signal_struct *const sig = tsk->signal;
- unsigned long long utime, ptime, virt_expires, prof_expires;
- unsigned long long sum_sched_runtime, sched_expires;
+ u64 utime, ptime, virt_expires, prof_expires;
+ u64 sum_sched_runtime, sched_expires;
struct list_head *timers = sig->cpu_timers;
struct task_cputime cputime;
unsigned long soft;
@@ -954,8 +908,8 @@ static void check_process_timers(struct task_struct *tsk,
* Collect the current process totals.
*/
thread_group_cputimer(tsk, &cputime);
- utime = cputime_to_expires(cputime.utime);
- ptime = utime + cputime_to_expires(cputime.stime);
+ utime = cputime.utime;
+ ptime = utime + cputime.stime;
sum_sched_runtime = cputime.sum_exec_runtime;
prof_expires = check_timers_list(timers, firing, ptime);
@@ -971,10 +925,10 @@ static void check_process_timers(struct task_struct *tsk,
SIGVTALRM);
soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
- unsigned long psecs = cputime_to_secs(ptime);
+ unsigned long psecs = div_u64(ptime, NSEC_PER_SEC);
unsigned long hard =
READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
- cputime_t x;
+ u64 x;
if (psecs >= hard) {
/*
* At the hard limit, we just die.
@@ -993,14 +947,13 @@ static void check_process_timers(struct task_struct *tsk,
sig->rlim[RLIMIT_CPU].rlim_cur = soft;
}
}
- x = secs_to_cputime(soft);
- if (!prof_expires || x < prof_expires) {
+ x = soft * NSEC_PER_SEC;
+ if (!prof_expires || x < prof_expires)
prof_expires = x;
- }
}
- sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
- sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
+ sig->cputime_expires.prof_exp = prof_expires;
+ sig->cputime_expires.virt_exp = virt_expires;
sig->cputime_expires.sched_exp = sched_expires;
if (task_cputime_zero(&sig->cputime_expires))
stop_process_timers(sig);
@@ -1017,7 +970,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
struct sighand_struct *sighand;
unsigned long flags;
struct task_struct *p = timer->it.cpu.task;
- unsigned long long now;
+ u64 now;
WARN_ON_ONCE(p == NULL);
@@ -1214,9 +1167,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* The tsk->sighand->siglock must be held by the caller.
*/
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
- cputime_t *newval, cputime_t *oldval)
+ u64 *newval, u64 *oldval)
{
- unsigned long long now;
+ u64 now;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1230,7 +1183,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
if (*oldval) {
if (*oldval <= now) {
/* Just about to fire. */
- *oldval = cputime_one_jiffy;
+ *oldval = TICK_NSEC;
} else {
*oldval -= now;
}
@@ -1310,7 +1263,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
/*
* We were interrupted by a signal.
*/
- sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
+ *rqtp = ns_to_timespec(timer.it.cpu.expires);
error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
if (!error) {
/*
@@ -1476,15 +1429,10 @@ static __init int init_posix_cpu_timers(void)
.clock_get = thread_cpu_clock_get,
.timer_create = thread_cpu_timer_create,
};
- struct timespec ts;
posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
- cputime_to_timespec(cputime_one_jiffy, &ts);
- onecputick = ts.tv_nsec;
- WARN_ON(ts.tv_sec != 0);
-
return 0;
}
__initcall(init_posix_cpu_timers);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 3109204c87cc..987e496bb51a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -29,12 +29,13 @@
*/
static struct tick_device tick_broadcast_device;
-static cpumask_var_t tick_broadcast_mask;
-static cpumask_var_t tick_broadcast_on;
-static cpumask_var_t tmpmask;
-static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly;
+static cpumask_var_t tmpmask __cpumask_var_read_mostly;
static int tick_broadcast_forced;
+static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+
#ifdef CONFIG_TICK_ONESHOT
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
@@ -347,17 +348,16 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
*
* Called when the system enters a state where affected tick devices
* might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
- *
- * Called with interrupts disabled, so clockevents_lock is not
- * required here because the local clock event device cannot go away
- * under us.
*/
void tick_broadcast_control(enum tick_broadcast_mode mode)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
int cpu, bc_stopped;
+ unsigned long flags;
+ /* Protects also the local clockevent device. */
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
@@ -365,12 +365,11 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
* Is the device not affected by the powerstate ?
*/
if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
- return;
+ goto out;
if (!tick_device_is_functional(dev))
- return;
+ goto out;
- raw_spin_lock(&tick_broadcast_lock);
cpu = smp_processor_id();
bc = tick_broadcast_device.evtdev;
bc_stopped = cpumask_empty(tick_broadcast_mask);
@@ -420,7 +419,8 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
tick_broadcast_setup_oneshot(bc);
}
}
- raw_spin_unlock(&tick_broadcast_lock);
+out:
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
EXPORT_SYMBOL_GPL(tick_broadcast_control);
@@ -517,9 +517,9 @@ void tick_resume_broadcast(void)
#ifdef CONFIG_TICK_ONESHOT
-static cpumask_var_t tick_broadcast_oneshot_mask;
-static cpumask_var_t tick_broadcast_pending_mask;
-static cpumask_var_t tick_broadcast_force_mask;
+static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly;
/*
* Exposed for debugging: see timer_list.c
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a3a9a8a029dc..25bdd2504571 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -702,6 +702,16 @@ u64 nsec_to_clock_t(u64 x)
#endif
}
+u64 jiffies64_to_nsecs(u64 j)
+{
+#if !(NSEC_PER_SEC % HZ)
+ return (NSEC_PER_SEC / HZ) * j;
+# else
+ return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN);
+#endif
+}
+EXPORT_SYMBOL(jiffies64_to_nsecs);
+
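
A worked example of the helper above (common HZ values assumed): with HZ=250, NSEC_PER_SEC % HZ == 0 and the function reduces to 4,000,000 * j; with HZ=300 the remainder is non-zero, so the div_u64() path is taken using the generated constants, i.e. j * 10,000,000 / 3.
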
/**
* nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
*
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index c48688904f9f..f83bbb81600b 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -98,6 +98,12 @@ define timeconst(hz) {
print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
+
+ cd=gcd(hz,1000000000)
+ print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n"
+ print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n"
+ print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n"
print "\n"
print "#endif /* KERNEL_TIMECONST_H */\n"
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index db087d7e106d..95b258dd75db 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1275,27 +1275,8 @@ error: /* even if we error out, we forwarded the time, so call update */
}
EXPORT_SYMBOL(timekeeping_inject_offset);
-
-/**
- * timekeeping_get_tai_offset - Returns current TAI offset from UTC
- *
- */
-s32 timekeeping_get_tai_offset(void)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
- s32 ret;
-
- do {
- seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->tai_offset;
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- return ret;
-}
-
/**
- * __timekeeping_set_tai_offset - Lock free worker function
+ * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
*
*/
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
@@ -1305,24 +1286,6 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
}
/**
- * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
- *
- */
-void timekeeping_set_tai_offset(s32 tai_offset)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
- __timekeeping_set_tai_offset(tk, tai_offset);
- timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- clock_was_set();
-}
-
-/**
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 704f595ce83f..d0914676d4c5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -11,8 +11,6 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
extern int timekeeping_inject_offset(struct timespec *ts);
-extern s32 timekeeping_get_tai_offset(void);
-extern void timekeeping_set_tai_offset(s32 tai_offset);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index ca9fb800336b..38bc4d2208e8 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -75,7 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
int bin = min(fls(t->tv_sec), NUM_BINS-1);
sleep_time_bin[bin]++;
- pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec,
- t->tv_nsec / NSEC_PER_MSEC);
+ printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n",
+ (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ec33a6933eae..82a6bfa0c307 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -571,38 +571,6 @@ internal_add_timer(struct timer_base *base, struct timer_list *timer)
trigger_dyntick_cpu(base, timer);
}
-#ifdef CONFIG_TIMER_STATS
-void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
-{
- if (timer->start_site)
- return;
-
- timer->start_site = addr;
- memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
- timer->start_pid = current->pid;
-}
-
-static void timer_stats_account_timer(struct timer_list *timer)
-{
- void *site;
-
- /*
- * start_site can be concurrently reset by
- * timer_stats_timer_clear_start_info()
- */
- site = READ_ONCE(timer->start_site);
- if (likely(!site))
- return;
-
- timer_stats_update_stats(timer, timer->start_pid, site,
- timer->function, timer->start_comm,
- timer->flags);
-}
-
-#else
-static void timer_stats_account_timer(struct timer_list *timer) {}
-#endif
-
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
static struct debug_obj_descr timer_debug_descr;
@@ -789,11 +757,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
{
timer->entry.pprev = NULL;
timer->flags = flags | raw_smp_processor_id();
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
- timer->start_pid = -1;
- memset(timer->start_comm, 0, TASK_COMM_LEN);
-#endif
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
@@ -1001,8 +964,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
base = lock_timer_base(timer, &flags);
}
- timer_stats_timer_set_start_info(timer);
-
ret = detach_if_pending(timer, base, false);
if (!ret && pending_only)
goto out_unlock;
@@ -1130,7 +1091,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
struct timer_base *new_base, *base;
unsigned long flags;
- timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
new_base = get_timer_cpu_base(timer->flags, cpu);
@@ -1176,7 +1136,6 @@ int del_timer(struct timer_list *timer)
debug_assert_init(timer);
- timer_stats_timer_clear_start_info(timer);
if (timer_pending(timer)) {
base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, true);
@@ -1204,10 +1163,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer) {
- timer_stats_timer_clear_start_info(timer);
+ if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true);
- }
+
spin_unlock_irqrestore(&base->lock, flags);
return ret;
@@ -1331,7 +1289,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
unsigned long data;
timer = hlist_entry(head->first, struct timer_list, entry);
- timer_stats_account_timer(timer);
base->running_timer = timer;
detach_timer(timer, true);
@@ -1868,7 +1825,6 @@ static void __init init_timer_cpus(void)
void __init init_timers(void)
{
init_timer_cpus();
- init_timer_stats();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index afe6cd1944fc..ff8d5c13d04b 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -62,21 +62,11 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
-#ifdef CONFIG_TIMER_STATS
- char tmp[TASK_COMM_LEN + 1];
-#endif
SEQ_printf(m, " #%d: ", idx);
print_name_offset(m, taddr);
SEQ_printf(m, ", ");
print_name_offset(m, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
-#ifdef CONFIG_TIMER_STATS
- SEQ_printf(m, ", ");
- print_name_offset(m, timer->start_site);
- memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
- tmp[TASK_COMM_LEN] = 0;
- SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
-#endif
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
@@ -127,7 +117,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .base: %pK\n", base);
SEQ_printf(m, " .index: %d\n", base->index);
- SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+ SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
SEQ_printf(m, " .get_time: ");
print_name_offset(m, base->get_time);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
deleted file mode 100644
index afddded947df..000000000000
--- a/kernel/time/timer_stats.c
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * kernel/time/timer_stats.c
- *
- * Collect timer usage statistics.
- *
- * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
- * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * timer_stats is based on timer_top, a similar functionality which was part of
- * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
- * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
- * on dynamic allocation of the statistics entries and linear search based
- * lookup combined with a global lock, rather than the static array, hash
- * and per-CPU locking which is used by timer_stats. It was written for the
- * pre hrtimer kernel code and therefore did not take hrtimers into account.
- * Nevertheless it provided the base for the timer_stats implementation and
- * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
- * for this effort.
- *
- * timer_top.c is
- * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
- * Written by Daniel Petrini <d.pensator@gmail.com>
- * timer_top.c was released under the GNU General Public License version 2
- *
- * We export the addresses and counting of timer functions being called,
- * the pid and cmdline from the owner process if applicable.
- *
- * Start/stop data collection:
- * # echo [1|0] >/proc/timer_stats
- *
- * Display the information collected so far:
- * # cat /proc/timer_stats
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-
-#include <linux/uaccess.h>
-
-/*
- * This is our basic unit of interest: a timer expiry event identified
- * by the timer, its start/expire functions and the PID of the task that
- * started the timer. We count the number of times an event happens:
- */
-struct entry {
- /*
- * Hash list:
- */
- struct entry *next;
-
- /*
- * Hash keys:
- */
- void *timer;
- void *start_func;
- void *expire_func;
- pid_t pid;
-
- /*
- * Number of timeout events:
- */
- unsigned long count;
- u32 flags;
-
- /*
- * We save the command-line string to preserve
- * this information past task exit:
- */
- char comm[TASK_COMM_LEN + 1];
-
-} ____cacheline_aligned_in_smp;
-
-/*
- * Spinlock protecting the tables - not taken during lookup:
- */
-static DEFINE_RAW_SPINLOCK(table_lock);
-
-/*
- * Per-CPU lookup locks for fast hash lookup:
- */
-static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
-
-/*
- * Mutex to serialize state changes with show-stats activities:
- */
-static DEFINE_MUTEX(show_mutex);
-
-/*
- * Collection status, active/inactive:
- */
-int __read_mostly timer_stats_active;
-
-/*
- * Beginning/end timestamps of measurement:
- */
-static ktime_t time_start, time_stop;
-
-/*
- * tstat entry structs only get allocated while collection is
- * active and never freed during that time - this simplifies
- * things quite a bit.
- *
- * They get freed when a new collection period is started.
- */
-#define MAX_ENTRIES_BITS 10
-#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
-
-static unsigned long nr_entries;
-static struct entry entries[MAX_ENTRIES];
-
-static atomic_t overflow_count;
-
-/*
- * The entries are in a hash-table, for fast lookup:
- */
-#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
-#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
-#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
-
-#define __tstat_hashfn(entry) \
- (((unsigned long)(entry)->timer ^ \
- (unsigned long)(entry)->start_func ^ \
- (unsigned long)(entry)->expire_func ^ \
- (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
-
-#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
-
-static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
-
-static void reset_entries(void)
-{
- nr_entries = 0;
- memset(entries, 0, sizeof(entries));
- memset(tstat_hash_table, 0, sizeof(tstat_hash_table));
- atomic_set(&overflow_count, 0);
-}
-
-static struct entry *alloc_entry(void)
-{
- if (nr_entries >= MAX_ENTRIES)
- return NULL;
-
- return entries + nr_entries++;
-}
-
-static int match_entries(struct entry *entry1, struct entry *entry2)
-{
- return entry1->timer == entry2->timer &&
- entry1->start_func == entry2->start_func &&
- entry1->expire_func == entry2->expire_func &&
- entry1->pid == entry2->pid;
-}
-
-/*
- * Look up whether an entry matching this item is present
- * in the hash already. Must be called with irqs off and the
- * lookup lock held:
- */
-static struct entry *tstat_lookup(struct entry *entry, char *comm)
-{
- struct entry **head, *curr, *prev;
-
- head = tstat_hashentry(entry);
- curr = *head;
-
- /*
- * The fastpath is when the entry is already hashed,
- * we do this with the lookup lock held, but with the
- * table lock not held:
- */
- while (curr) {
- if (match_entries(curr, entry))
- return curr;
-
- curr = curr->next;
- }
- /*
- * Slowpath: allocate, set up and link a new hash entry:
- */
- prev = NULL;
- curr = *head;
-
- raw_spin_lock(&table_lock);
- /*
- * Make sure we have not raced with another CPU:
- */
- while (curr) {
- if (match_entries(curr, entry))
- goto out_unlock;
-
- prev = curr;
- curr = curr->next;
- }
-
- curr = alloc_entry();
- if (curr) {
- *curr = *entry;
- curr->count = 0;
- curr->next = NULL;
- memcpy(curr->comm, comm, TASK_COMM_LEN);
-
- smp_mb(); /* Ensure that curr is initialized before insert */
-
- if (prev)
- prev->next = curr;
- else
- *head = curr;
- }
- out_unlock:
- raw_spin_unlock(&table_lock);
-
- return curr;
-}
-
-/**
- * timer_stats_update_stats - Update the statistics for a timer.
- * @timer: pointer to either a timer_list or a hrtimer
- * @pid: the pid of the task which set up the timer
- * @startf: pointer to the function which did the timer setup
- * @timerf: pointer to the timer callback function of the timer
- * @comm: name of the process which set up the timer
- * @tflags: The flags field of the timer
- *
- * When the timer is already registered, then the event counter is
- * incremented. Otherwise the timer is registered in a free slot.
- */
-void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
- void *timerf, char *comm, u32 tflags)
-{
- /*
- * It doesn't matter which lock we take:
- */
- raw_spinlock_t *lock;
- struct entry *entry, input;
- unsigned long flags;
-
- if (likely(!timer_stats_active))
- return;
-
- lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
-
- input.timer = timer;
- input.start_func = startf;
- input.expire_func = timerf;
- input.pid = pid;
- input.flags = tflags;
-
- raw_spin_lock_irqsave(lock, flags);
- if (!timer_stats_active)
- goto out_unlock;
-
- entry = tstat_lookup(&input, comm);
- if (likely(entry))
- entry->count++;
- else
- atomic_inc(&overflow_count);
-
- out_unlock:
- raw_spin_unlock_irqrestore(lock, flags);
-}
-
-static void print_name_offset(struct seq_file *m, unsigned long addr)
-{
- char symname[KSYM_NAME_LEN];
-
- if (lookup_symbol_name(addr, symname) < 0)
- seq_printf(m, "<%p>", (void *)addr);
- else
- seq_printf(m, "%s", symname);
-}
-
-static int tstats_show(struct seq_file *m, void *v)
-{
- struct timespec64 period;
- struct entry *entry;
- unsigned long ms;
- long events = 0;
- ktime_t time;
- int i;
-
- mutex_lock(&show_mutex);
- /*
- * If still active then calculate up to now:
- */
- if (timer_stats_active)
- time_stop = ktime_get();
-
- time = ktime_sub(time_stop, time_start);
-
- period = ktime_to_timespec64(time);
- ms = period.tv_nsec / 1000000;
-
- seq_puts(m, "Timer Stats Version: v0.3\n");
- seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);
- if (atomic_read(&overflow_count))
- seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
- seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
-
- for (i = 0; i < nr_entries; i++) {
- entry = entries + i;
- if (entry->flags & TIMER_DEFERRABLE) {
- seq_printf(m, "%4luD, %5d %-16s ",
- entry->count, entry->pid, entry->comm);
- } else {
- seq_printf(m, " %4lu, %5d %-16s ",
- entry->count, entry->pid, entry->comm);
- }
-
- print_name_offset(m, (unsigned long)entry->start_func);
- seq_puts(m, " (");
- print_name_offset(m, (unsigned long)entry->expire_func);
- seq_puts(m, ")\n");
-
- events += entry->count;
- }
-
- ms += period.tv_sec * 1000;
- if (!ms)
- ms = 1;
-
- if (events && period.tv_sec)
- seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
- events, events * 1000 / ms,
- (events * 1000000 / ms) % 1000);
- else
- seq_printf(m, "%ld total events\n", events);
-
- mutex_unlock(&show_mutex);
-
- return 0;
-}
-
-/*
- * After a state change, make sure all concurrent lookup/update
- * activities have stopped:
- */
-static void sync_access(void)
-{
- unsigned long flags;
- int cpu;
-
- for_each_online_cpu(cpu) {
- raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
-
- raw_spin_lock_irqsave(lock, flags);
- /* nothing */
- raw_spin_unlock_irqrestore(lock, flags);
- }
-}
-
-static ssize_t tstats_write(struct file *file, const char __user *buf,
- size_t count, loff_t *offs)
-{
- char ctl[2];
-
- if (count != 2 || *offs)
- return -EINVAL;
-
- if (copy_from_user(ctl, buf, count))
- return -EFAULT;
-
- mutex_lock(&show_mutex);
- switch (ctl[0]) {
- case '0':
- if (timer_stats_active) {
- timer_stats_active = 0;
- time_stop = ktime_get();
- sync_access();
- }
- break;
- case '1':
- if (!timer_stats_active) {
- reset_entries();
- time_start = ktime_get();
- smp_mb();
- timer_stats_active = 1;
- }
- break;
- default:
- count = -EINVAL;
- }
- mutex_unlock(&show_mutex);
-
- return count;
-}
-
-static int tstats_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, tstats_show, NULL);
-}
-
-static const struct file_operations tstats_fops = {
- .open = tstats_open,
- .read = seq_read,
- .write = tstats_write,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-void __init init_timer_stats(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
-}
-
-static int __init init_tstats_procfs(void)
-{
- struct proc_dir_entry *pe;
-
- pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
- if (!pe)
- return -ENOMEM;
- return 0;
-}
-__initcall(init_tstats_procfs);
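As an aside (not part of the patch): the deleted timer_stats.c exposed its data through a seq_file-backed /proc entry created with proc_create(), started and stopped by writing 1 or 0 and read back with cat, exactly as its header comment describes. The following is only a minimal sketch of that same procfs/seq_file pattern; the file name and function names are invented for illustration.

/*
 * Sketch of the procfs + seq_file pattern the removed file used.
 * Names are illustrative only.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_stats_show(struct seq_file *m, void *v)
{
	seq_puts(m, "Timer Stats Version: v0.3\n");	/* header line as printed by tstats_show() */
	return 0;
}

static int example_stats_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, example_stats_show, NULL);
}

static const struct file_operations example_stats_fops = {
	.open		= example_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init example_stats_init(void)
{
	/* 0444: read-only; the removed file used 0644 because it also accepted writes */
	if (!proc_create("example_stats", 0444, NULL, &example_stats_fops))
		return -ENOMEM;
	return 0;
}
__initcall(example_stats_init);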
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 775569ec50d0..af344a1bf0d0 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -266,7 +266,7 @@ out:
static struct cpumask save_cpumask;
static bool disable_migrate;
-static void move_to_next_cpu(void)
+static void move_to_next_cpu(bool initmask)
{
static struct cpumask *current_mask;
int next_cpu;
@@ -275,7 +275,7 @@ static void move_to_next_cpu(void)
return;
/* Just pick the first CPU on first iteration */
- if (!current_mask) {
+ if (initmask) {
current_mask = &save_cpumask;
get_online_cpus();
cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
@@ -330,10 +330,12 @@ static void move_to_next_cpu(void)
static int kthread_fn(void *data)
{
u64 interval;
+ bool initmask = true;
while (!kthread_should_stop()) {
- move_to_next_cpu();
+ move_to_next_cpu(initmask);
+ initmask = false;
local_irq_disable();
get_sample();
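Not part of the patch, but for illustration: the hwlat change replaces the old `if (!current_mask)` test with an explicit initmask flag owned by the kthread loop, so the one-time cpumask setup runs on the first iteration only. A minimal sketch of that first-iteration-flag pattern, with invented names:

#include <linux/kthread.h>

/* Sketch only: one-time setup driven by a loop-local flag. */
static int example_thread_fn(void *data)
{
	bool first = true;

	while (!kthread_should_stop()) {
		if (first) {
			/* one-time setup, e.g. computing the initial CPU mask */
			first = false;
		}
		/* ... periodic sampling work ... */
	}
	return 0;
}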
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a133ecd741e4..7ad9e53ad174 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1372,7 +1372,7 @@ kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6)
return a1 + a2 + a3 + a4 + a5 + a6;
}
-static struct __init trace_event_file *
+static __init struct trace_event_file *
find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
{
struct trace_event_file *file;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f8e26ab963ed..5c21f0535056 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -31,7 +31,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
struct taskstats *stats, struct task_struct *tsk)
{
const struct cred *tcred;
- cputime_t utime, stime, utimescaled, stimescaled;
+ u64 utime, stime, utimescaled, stimescaled;
u64 delta;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -67,12 +67,12 @@ void bacct_add_tsk(struct user_namespace *user_ns,
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
- stats->ac_utime = cputime_to_usecs(utime);
- stats->ac_stime = cputime_to_usecs(stime);
+ stats->ac_utime = div_u64(utime, NSEC_PER_USEC);
+ stats->ac_stime = div_u64(stime, NSEC_PER_USEC);
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
- stats->ac_utimescaled = cputime_to_usecs(utimescaled);
- stats->ac_stimescaled = cputime_to_usecs(stimescaled);
+ stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC);
+ stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC);
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
@@ -123,18 +123,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
#undef MB
static void __acct_update_integrals(struct task_struct *tsk,
- cputime_t utime, cputime_t stime)
+ u64 utime, u64 stime)
{
- cputime_t time, dtime;
- u64 delta;
+ u64 time, delta;
if (!likely(tsk->mm))
return;
time = stime + utime;
- dtime = time - tsk->acct_timexpd;
- /* Avoid division: cputime_t is often in nanoseconds already. */
- delta = cputime_to_nsecs(dtime);
+ delta = time - tsk->acct_timexpd;
if (delta < TICK_NSEC)
return;
@@ -155,7 +152,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
*/
void acct_update_integrals(struct task_struct *tsk)
{
- cputime_t utime, stime;
+ u64 utime, stime;
unsigned long flags;
local_irq_save(flags);
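For illustration only (not in the patch): with task times carried as u64 nanoseconds instead of cputime_t, microsecond fields are produced with div_u64() and sub-tick deltas are skipped outright, as in the __acct_update_integrals() hunk above. A minimal sketch of those two conversions, assuming nothing beyond the helpers the hunks already use:

#include <linux/jiffies.h>
#include <linux/math64.h>
#include <linux/time64.h>

/* Sketch only: nanosecond bookkeeping, microsecond reporting. */
static u64 example_ns_to_us(u64 ns)
{
	return div_u64(ns, NSEC_PER_USEC);	/* 64-bit division helper, safe on 32-bit */
}

static bool example_delta_is_accountable(u64 now_ns, u64 last_ns)
{
	return (now_ns - last_ns) >= TICK_NSEC;	/* ignore deltas below one tick */
}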
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 9d20d5dd298a..95c6336fc2b3 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -128,10 +128,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
- spin_lock(&ucounts_lock);
+ spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
if (!ucounts) {
- spin_unlock(&ucounts_lock);
+ spin_unlock_irq(&ucounts_lock);
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
@@ -141,7 +141,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
new->uid = uid;
atomic_set(&new->count, 0);
- spin_lock(&ucounts_lock);
+ spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
if (ucounts) {
kfree(new);
@@ -152,16 +152,18 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
}
if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
ucounts = NULL;
- spin_unlock(&ucounts_lock);
+ spin_unlock_irq(&ucounts_lock);
return ucounts;
}
static void put_ucounts(struct ucounts *ucounts)
{
+ unsigned long flags;
+
if (atomic_dec_and_test(&ucounts->count)) {
- spin_lock(&ucounts_lock);
+ spin_lock_irqsave(&ucounts_lock, flags);
hlist_del_init(&ucounts->node);
- spin_unlock(&ucounts_lock);
+ spin_unlock_irqrestore(&ucounts_lock, flags);
kfree(ucounts);
}
@@ -225,11 +227,10 @@ static __init int user_namespace_sysctl_init(void)
* properly.
*/
user_header = register_sysctl("user", empty);
+ kmemleak_ignore(user_header);
BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
return 0;
}
subsys_initcall(user_namespace_sysctl_init);
-
-
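An illustrative aside, not part of the patch: the ucount.c hunks switch ucounts_lock to interrupt-disabling lock variants, using spin_lock_irq() where the caller is known to run with interrupts enabled and spin_lock_irqsave() in put_ucounts(), where the caller's interrupt state may vary. A minimal sketch of the two idioms, with a hypothetical lock and list:

#include <linux/list.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static LIST_HEAD(example_list);

/* Caller known to run with interrupts enabled: plain _irq variant. */
static void example_add(struct list_head *item)
{
	spin_lock_irq(&example_lock);
	list_add(item, &example_list);
	spin_unlock_irq(&example_lock);
}

/* Caller's interrupt state unknown: save and restore it. */
static void example_del(struct list_head *item)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);
	list_del_init(item);
	spin_unlock_irqrestore(&example_lock, flags);
}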
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d4b0fa01cae3..63177be0159e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -49,6 +49,8 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+
/*
* The 'watchdog_running' variable is set to 1 when the watchdog threads
* are registered/started and is set to 0 when the watchdog threads are
@@ -260,6 +262,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return HRTIMER_NORESTART;
+
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -467,12 +472,16 @@ static int watchdog_park_threads(void)
{
int cpu, ret = 0;
+ atomic_set(&watchdog_park_in_progress, 1);
+
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
+ atomic_set(&watchdog_park_in_progress, 0);
+
return ret;
}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 84016c8aee6b..12b8dd640786 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -84,6 +84,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return;
+
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
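As a sketch rather than part of the patch: the watchdog hunks add an atomic_t flag that is set for the duration of watchdog_park_threads() and checked at the top of both the hrtimer callback and the perf NMI callback, so neither path runs while the per-CPU threads are being parked. The atomic-flag guard pattern, with invented names:

#include <linux/atomic.h>

static atomic_t example_op_in_progress = ATOMIC_INIT(0);

static int example_periodic_callback(void)
{
	if (atomic_read(&example_op_in_progress) != 0)
		return 0;		/* skip the check while the operation runs */
	/* ... normal periodic work ... */
	return 1;
}

static void example_maintenance(void)
{
	atomic_set(&example_op_in_progress, 1);
	/* ... park/unpark threads or other maintenance ... */
	atomic_set(&example_op_in_progress, 0);
}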
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1d9fb6543a66..072cbc9b175d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1523,8 +1523,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
return;
}
- timer_stats_timer_set_start_info(&dwork->timer);
-
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;